gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* The first mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    by the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walks it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "tree-pass.h"
  96 #include "alloc-pool.h"
  97 #include "basic-block.h"
  98 #include "target.h"
  99 #include "gimple-pretty-print.h"
 100
 101 /* FIXME: RTL headers have to be included here for optabs.  */
 102 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 103 #include "expr.h"               /* Because optabs.h wants sepops.  */
 104 #include "optabs.h"
 105
 106 /* This structure represents one basic block that either computes a
 107    division, or is a common dominator for basic blocks that compute a
 108    division.  */
 109 struct occurrence {
 110   /* The basic block represented by this structure.  */
 111   basic_block bb;
 112
 113   /* If non-NULL, the SSA_NAME holding a reciprocal usable in BB: either
 114      one inserted in BB or one inherited from a dominating occurrence.  */
 115   tree recip_def;
 116
 117   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 118      was inserted in BB; replace_reciprocal leaves this statement alone.  */
 119   gimple recip_def_stmt;
 120
 121   /* Pointer to a list of "struct occurrence"s for blocks dominated
 122      by BB.  */
 123   struct occurrence *children;
 124
 125   /* Pointer to the next "struct occurrence" in the list of blocks
 126      sharing a common dominator.  */
 127   struct occurrence *next;
 128
 129   /* The number of divisions that are in BB before compute_merit.  The
 130      number of divisions that are in BB or post-dominate it after
 131      compute_merit.  */
 132   int num_divisions;
 133
 134   /* True if the basic block has a division, false if it is a common
 135      dominator for basic blocks that do.  If it is false and trapping
 136      math is active, BB is not a candidate for inserting a reciprocal.  */
 137   bool bb_has_division;
 138 };
 139
 140 static struct
 141 {
 142   /* Number of 1.0/X computations inserted by insert_reciprocals.  */
 143   int rdivs_inserted;
 144
 145   /* Number of calls rewritten to a target reciprocal builtin (1.0/FUNC).  */
 146   int rfuncs_inserted;
 147 } reciprocal_stats;  /* Zeroed at the start of execute_cse_reciprocals.  */
 148
 149 static struct
 150 {
 151   /* Number of cexpi calls inserted by execute_cse_sincos_1.  */
 152   int inserted;
 153 } sincos_stats;
 154
 155 static struct
 156 {
 157   /* Number of hand-written 32-bit bswaps found.  */
 158   int found_32bit;
 159
 160   /* Number of hand-written 64-bit bswaps found.  */
 161   int found_64bit;
 162 } bswap_stats;  /* NOTE(review): no user is visible in this part of the file.  */
 163
 164 static struct
 165 {
 166   /* Number of widening multiplication ops inserted.  */
 167   int widen_mults_inserted;
 168
 169   /* Number of integer multiply-and-accumulate ops inserted.  */
 170   int maccs_inserted;
 171
 172   /* Number of fp fused multiply-add ops inserted.  */
 173   int fmas_inserted;
 174 } widen_mul_stats;  /* NOTE(review): no user is visible in this part of the file.  */
 175
 176 /* Head of the list of top-level "struct occurrence"s, i.e. the highest
 177    interesting blocks in the dominator tree.  Reset to NULL after each DEF.  */
 178 static struct occurrence *occ_head;
 179
 180 /* Allocation pool for getting instances of "struct occurrence".
        It is created and released by execute_cse_reciprocals.  */
 181 static alloc_pool occ_pool;
 182
 183
 184
 185 /* Allocate and return a new struct occurrence for basic block BB, and
 186    whose children list is headed by CHILDREN.  */
 187 static struct occurrence *
 188 occ_new (basic_block bb, struct occurrence *children)
 189 {
 190   struct occurrence *occ;
 191
 192   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 193   memset (occ, 0, sizeof (struct occurrence));
 194
 195   occ->bb = bb;
 196   occ->children = children;
 197   return occ;
 198 }
 199
 200
 201 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 202    list of "struct occurrence"s, one per basic block, having IDOM as
 203    their common dominator.
 204
 205    We try to insert NEW_OCC as deep as possible in the tree, and we also
 206    insert any other block that is a common dominator for BB and one
 207    block already in the tree.  */
 208
 209 static void
 210 insert_bb (struct occurrence *new_occ, basic_block idom,
 211            struct occurrence **p_head)
 212 {
 213   struct occurrence *occ, **p_occ;
 214
 215   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 216     {
 217       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 218       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 219       if (dom == bb)
 220         {
 221           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 222              from its list.  */
 223           *p_occ = occ->next;
 224           occ->next = new_occ->children;
 225           new_occ->children = occ;
 226
 227           /* Try the next block (it may as well be dominated by BB).  */
 228         }
 229
 230       else if (dom == occ_bb)
 231         {
 232           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 233           insert_bb (new_occ, dom, &occ->children);
 234           return;
 235         }
 236
 237       else if (dom != idom)
 238         {
 239           gcc_assert (!dom->aux);
 240
 241           /* There is a dominator between IDOM and BB, add it and make
 242              two children out of NEW_OCC and OCC.  First, remove OCC from
 243              its list.  */
 244           *p_occ = occ->next;
 245           new_occ->next = occ;
 246           occ->next = NULL;
 247
 248           /* None of the previous blocks has DOM as a dominator: if we tail
 249              recursed, we would reexamine them uselessly. Just switch BB with
 250              DOM, and go on looking for blocks dominated by DOM.  */
 251           new_occ = occ_new (dom, new_occ);
 252         }
 253
 254       else
 255         {
 256           /* Nothing special, go on with the next element.  */
 257           p_occ = &occ->next;
 258         }
 259     }
 260
 261   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 262   new_occ->next = *p_head;
 263   *p_head = new_occ;
 264 }
 265
 266 /* Register that we found a division in BB.  */
 267
 268 static inline void
 269 register_division_in (basic_block bb)
 270 {
 271   struct occurrence *occ;
 272
 273   occ = (struct occurrence *) bb->aux;
 274   if (!occ)
 275     {
 276       occ = occ_new (bb, NULL);
 277       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 278     }
 279
 280   occ->bb_has_division = true;
 281   occ->num_divisions++;
 282 }
 283
 284
 285 /* Compute the number of divisions that postdominate each block in OCC and
 286    its children.  */
 287
 288 static void
 289 compute_merit (struct occurrence *occ)
 290 {
 291   struct occurrence *occ_child;
 292   basic_block dom = occ->bb;
 293
 294   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 295     {
 296       basic_block bb;
 297       if (occ_child->children)
 298         compute_merit (occ_child);
 299
 300       if (flag_exceptions)
 301         bb = single_noncomplex_succ (dom);
 302       else
 303         bb = dom;
 304
 305       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 306         occ->num_divisions += occ_child->num_divisions;
 307     }
 308 }
 309
 310
 311 /* Return whether USE_STMT is a floating-point division by DEF.  */
 312 static inline bool
 313 is_division_by (gimple use_stmt, tree def)
 314 {
 315   return is_gimple_assign (use_stmt)
 316          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 317          && gimple_assign_rhs2 (use_stmt) == def
 318          /* Do not recognize x / x as valid division, as we are getting
 319             confused later by replacing all immediate uses x in such
 320             a stmt.  */
 321          && gimple_assign_rhs1 (use_stmt) != def;
 322 }
 323
 324 /* Walk the subset of the dominator tree rooted at OCC, setting the
 325    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 326    the given basic block.  The field may be left NULL, of course,
 327    if it is not possible or profitable to do the optimization.
 328
 329    DEF_BSI is an iterator pointing at the statement defining DEF.
 330    If RECIP_DEF is set, a dominator already has a computation that can
 331    be used.  */
 332
 333 static void
 334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 335                     tree def, tree recip_def, int threshold)
 336 {
 337   tree type;
 338   gimple new_stmt;
 339   gimple_stmt_iterator gsi;
 340   struct occurrence *occ_child;
 341
 342   if (!recip_def
 343       && (occ->bb_has_division || !flag_trapping_math)
 344       && occ->num_divisions >= threshold)
 345     {
 346       /* Make a variable with the replacement and substitute it.  */
 347       type = TREE_TYPE (def);
 348       recip_def = make_rename_temp (type, "reciptmp");
 349       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 350                                                build_one_cst (type), def);
 351
 352       if (occ->bb_has_division)
 353         {
 354           /* Case 1: insert before an existing division.  */
 355           gsi = gsi_after_labels (occ->bb);
 356           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 357             gsi_next (&gsi);
 358
 359           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 360         }
 361       else if (def_gsi && occ->bb == def_gsi->bb)
 362         {
 363           /* Case 2: insert right after the definition.  Note that this will
 364              never happen if the definition statement can throw, because in
 365              that case the sole successor of the statement's basic block will
 366              dominate all the uses as well.  */
 367           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 368         }
 369       else
 370         {
 371           /* Case 3: insert in a basic block not containing defs/uses.  */
 372           gsi = gsi_after_labels (occ->bb);
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375
 376       reciprocal_stats.rdivs_inserted++;
 377
 378       occ->recip_def_stmt = new_stmt;
 379     }
 380
 381   occ->recip_def = recip_def;
 382   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 383     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 384 }
 385
 386
 387 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 388    possible.  */
 389
 390 static inline void
 391 replace_reciprocal (use_operand_p use_p)
 392 {
 393   gimple use_stmt = USE_STMT (use_p);
 394   basic_block bb = gimple_bb (use_stmt);
 395   struct occurrence *occ = (struct occurrence *) bb->aux;
 396
 397   if (optimize_bb_for_speed_p (bb)
 398       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 399     {
 400       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (&gsi);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 517         && is_gimple_reg (arg))
 518       {
 519         tree name = ssa_default_def (cfun, arg);
 520         if (name)
 521           execute_cse_reciprocals_1 (NULL, name);
 522       }
 523
 524   FOR_EACH_BB (bb)
 525     {
 526       gimple_stmt_iterator gsi;
 527       gimple phi;
 528       tree def;
 529
 530       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 531         {
 532           phi = gsi_stmt (gsi);
 533           def = PHI_RESULT (phi);
 534           if (FLOAT_TYPE_P (TREE_TYPE (def))
 535               && is_gimple_reg (def))
 536             execute_cse_reciprocals_1 (NULL, def);
 537         }
 538
 539       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 540         {
 541           gimple stmt = gsi_stmt (gsi);
 542
 543           if (gimple_has_lhs (stmt)
 544               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 545               && FLOAT_TYPE_P (TREE_TYPE (def))
 546               && TREE_CODE (def) == SSA_NAME)
 547             execute_cse_reciprocals_1 (&gsi, def);
 548         }
 549
 550       if (optimize_bb_for_size_p (bb))
 551         continue;
 552
 553       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 554       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 555         {
 556           gimple stmt = gsi_stmt (gsi);
 557           tree fndecl;
 558
 559           if (is_gimple_assign (stmt)
 560               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 561             {
 562               tree arg1 = gimple_assign_rhs2 (stmt);
 563               gimple stmt1;
 564
 565               if (TREE_CODE (arg1) != SSA_NAME)
 566                 continue;
 567
 568               stmt1 = SSA_NAME_DEF_STMT (arg1);
 569
 570               if (is_gimple_call (stmt1)
 571                   && gimple_call_lhs (stmt1)
 572                   && (fndecl = gimple_call_fndecl (stmt1))
 573                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 574                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 575                 {
 576                   enum built_in_function code;
 577                   bool md_code, fail;
 578                   imm_use_iterator ui;
 579                   use_operand_p use_p;
 580
 581                   code = DECL_FUNCTION_CODE (fndecl);
 582                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 583
 584                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 585                   if (!fndecl)
 586                     continue;
 587
 588                   /* Check that all uses of the SSA name are divisions,
 589                      otherwise replacing the defining statement will do
 590                      the wrong thing.  */
 591                   fail = false;
 592                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 593                     {
 594                       gimple stmt2 = USE_STMT (use_p);
 595                       if (is_gimple_debug (stmt2))
 596                         continue;
 597                       if (!is_gimple_assign (stmt2)
 598                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 599                           || gimple_assign_rhs1 (stmt2) == arg1
 600                           || gimple_assign_rhs2 (stmt2) != arg1)
 601                         {
 602                           fail = true;
 603                           break;
 604                         }
 605                     }
 606                   if (fail)
 607                     continue;
 608
 609                   gimple_replace_lhs (stmt1, arg1);
 610                   gimple_call_set_fndecl (stmt1, fndecl);
 611                   update_stmt (stmt1);
 612                   reciprocal_stats.rfuncs_inserted++;
 613
 614                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 615                     {
 616                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 617                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 618                       fold_stmt_inplace (&gsi);
 619                       update_stmt (stmt);
 620                     }
 621                 }
 622             }
 623         }
 624     }
 625
 626   statistics_counter_event (cfun, "reciprocal divs inserted",
 627                             reciprocal_stats.rdivs_inserted);
 628   statistics_counter_event (cfun, "reciprocal functions inserted",
 629                             reciprocal_stats.rfuncs_inserted);
 630
 631   free_dominance_info (CDI_DOMINATORS);
 632   free_dominance_info (CDI_POST_DOMINATORS);
 633   free_alloc_pool (occ_pool);
 634   return 0;
 635 }
 636
     /* Pass descriptor for the "recip" GIMPLE pass: gated by
        gate_cse_reciprocals and executed by execute_cse_reciprocals.  */
 637 struct gimple_opt_pass pass_cse_reciprocals =
 638 {
 639  {
 640   GIMPLE_PASS,
 641   "recip",                              /* name */
 642   gate_cse_reciprocals,                 /* gate */
 643   execute_cse_reciprocals,              /* execute */
 644   NULL,                                 /* sub */
 645   NULL,                                 /* next */
 646   0,                                    /* static_pass_number */
 647   TV_NONE,                              /* tv_id */
 648   PROP_ssa,                             /* properties_required */
 649   0,                                    /* properties_provided */
 650   0,                                    /* properties_destroyed */
 651   0,                                    /* todo_flags_start */
 652   TODO_update_ssa | TODO_verify_ssa
 653     | TODO_verify_stmts                /* todo_flags_finish */
 654  }
 655 };
 656
 657 /* Records an occurrence at statement USE_STMT in the vector of trees
 658    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 659    is not yet initialized.  Returns true if the occurrence was pushed on
 660    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 661    statements in the vector.  */
 662
 663 static bool
 664 maybe_record_sincos (VEC(gimple, heap) **stmts,
 665                      basic_block *top_bb, gimple use_stmt)
 666 {
 667   basic_block use_bb = gimple_bb (use_stmt);
 668   if (*top_bb
 669       && (*top_bb == use_bb
 670           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 671     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 672   else if (!*top_bb
 673            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 674     {
 675       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 676       *top_bb = use_bb;
 677     }
 678   else
 679     return false;
 680
 681   return true;
 682 }
 683
 684 /* Look for sin, cos and cexpi calls with the same argument NAME and
 685    create a single call to cexpi CSEing the result in this case.
 686    We first walk over all immediate uses of the argument collecting
 687    statements that we can CSE in a vector and in a second pass replace
 688    the statement rhs with a REALPART or IMAGPART expression on the
 689    result of the cexpi call we insert before the use statement that
 690    dominates all other candidates.  */
 691
 692 static bool
 693 execute_cse_sincos_1 (tree name)
 694 {
 695   gimple_stmt_iterator gsi;
 696   imm_use_iterator use_iter;
 697   tree fndecl, res, type;
 698   gimple def_stmt, use_stmt, stmt;
 699   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 700   VEC(gimple, heap) *stmts = NULL;
 701   basic_block top_bb = NULL;
 702   int i;
 703   bool cfg_changed = false;
 704
 705   type = TREE_TYPE (name);
 706   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 707     {
 708       if (gimple_code (use_stmt) != GIMPLE_CALL
 709           || !gimple_call_lhs (use_stmt)
 710           || !(fndecl = gimple_call_fndecl (use_stmt))
 711           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 712         continue;
 713
 714       switch (DECL_FUNCTION_CODE (fndecl))
 715         {
 716         CASE_FLT_FN (BUILT_IN_COS):
 717           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_SIN):
 721           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         CASE_FLT_FN (BUILT_IN_CEXPI):
 725           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 726           break;
 727
 728         default:;
 729         }
 730     }
 731
 732   if (seen_cos + seen_sin + seen_cexpi <= 1)
 733     {
 734       VEC_free(gimple, heap, stmts);
 735       return false;
 736     }
 737
 738   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 739      the name def statement.  */
 740   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 741   if (!fndecl)
 742     return false;
 743   res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
 744   stmt = gimple_build_call (fndecl, 1, name);
 745   res = make_ssa_name (res, stmt);
 746   gimple_call_set_lhs (stmt, res);
 747
 748   def_stmt = SSA_NAME_DEF_STMT (name);
 749   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 750       && gimple_code (def_stmt) != GIMPLE_PHI
 751       && gimple_bb (def_stmt) == top_bb)
 752     {
 753       gsi = gsi_for_stmt (def_stmt);
 754       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 755     }
 756   else
 757     {
 758       gsi = gsi_after_labels (top_bb);
 759       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 760     }
 761   update_stmt (stmt);
 762   sincos_stats.inserted++;
 763
 764   /* And adjust the recorded old call sites.  */
 765   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 766     {
 767       tree rhs = NULL;
 768       fndecl = gimple_call_fndecl (use_stmt);
 769
 770       switch (DECL_FUNCTION_CODE (fndecl))
 771         {
 772         CASE_FLT_FN (BUILT_IN_COS):
 773           rhs = fold_build1 (REALPART_EXPR, type, res);
 774           break;
 775
 776         CASE_FLT_FN (BUILT_IN_SIN):
 777           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 778           break;
 779
 780         CASE_FLT_FN (BUILT_IN_CEXPI):
 781           rhs = res;
 782           break;
 783
 784         default:;
 785           gcc_unreachable ();
 786         }
 787
 788         /* Replace call with a copy.  */
 789         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 790
 791         gsi = gsi_for_stmt (use_stmt);
 792         gsi_replace (&gsi, stmt, true);
 793         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 794           cfg_changed = true;
 795     }
 796
 797   VEC_free(gimple, heap, stmts);
 798
 799   return cfg_changed;
 800 }
 801
 802 /* To evaluate powi(x,n), the floating point value x raised to the
 803    constant integer exponent n, we use a hybrid algorithm that
 804    combines the "window method" with look-up tables.  For an
 805    introduction to exponentiation algorithms and "addition chains",
 806    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 807    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 808    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 809    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 810
 811 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 812    multiplications to inline before calling the system library's pow
 813    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 814    so this default never requires calling pow, powf or powl.  A definition made before this point takes precedence.  */
 815
 816 #ifndef POWI_MAX_MULTS
 817 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 818 #endif
 819
 820 /* The size of the "optimal power tree" lookup table.  All
 821    exponents less than this value are simply looked up in the
 822    powi_table below.  This threshold is also used to size the
 823    cache of pseudo registers that hold intermediate results.  */
 824 #define POWI_TABLE_SIZE 256
 825
 826 /* The size, in bits of the window, used in the "window method"
 827    exponentiation algorithm.  This is equivalent to a radix of
 828    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method", i.e. radix 8 for the value below.  */
 829 #define POWI_WINDOW_SIZE 3
 830
 831 /* The following table is an efficient representation of an
 832    "optimal power tree".  For each value, i, the corresponding
 833    value, j, in the table states that an optimal evaluation
 834    sequence for calculating pow(x,i) can be found by evaluating
 835    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 836    100 integers is given in Knuth's "Seminumerical algorithms".  */
 837
 838 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 839   {
 840       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 841       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 842       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 843      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 844      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 845      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 846      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 847      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 848      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 849      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 850      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 851      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 852      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 853      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 854      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 855      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 856      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 857      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 858      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 859      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 860      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 861      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 862      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 863      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 864      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 865     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 866     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 867     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 868     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 869     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 870     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 871     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 872   };
 873
 874
 875 /* Return the number of multiplications required to calculate
 876    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 877    subroutine of powi_cost.  CACHE is an array indicating
 878    which exponents have already been calculated.  */
 879
 880 static int
 881 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 882 {
 883   /* If we've already calculated this exponent, then this evaluation
 884      doesn't require any additional multiplications.  */
 885   if (cache[n])
 886     return 0;
 887
 888   cache[n] = true;
 889   return powi_lookup_cost (n - powi_table[n], cache)
 890          + powi_lookup_cost (powi_table[n], cache) + 1;
 891 }
 892
 893 /* Return the number of multiplications required to calculate
 894    powi(x,n) for an arbitrary x, given the exponent N.  This
 895    function needs to be kept in sync with powi_as_mults below.  */
 896
 897 static int
 898 powi_cost (HOST_WIDE_INT n)
 899 {
 900   bool cache[POWI_TABLE_SIZE];
 901   unsigned HOST_WIDE_INT digit;
 902   unsigned HOST_WIDE_INT val;
 903   int result;
 904
 905   if (n == 0)
 906     return 0;
 907
 908   /* Ignore the reciprocal when calculating the cost.  */
 909   val = (n < 0) ? -n : n;
 910
 911   /* Initialize the exponent cache.  */
 912   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 913   cache[1] = true;
 914
 915   result = 0;
 916
 917   while (val >= POWI_TABLE_SIZE)
 918     {
 919       if (val & 1)
 920         {
 921           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 922           result += powi_lookup_cost (digit, cache)
 923                     + POWI_WINDOW_SIZE + 1;
 924           val >>= POWI_WINDOW_SIZE;
 925         }
 926       else
 927         {
 928           val >>= 1;
 929           result++;
 930         }
 931     }
 932
 933   return result + powi_lookup_cost (val, cache);
 934 }
 935
 936 /* Recursive subroutine of powi_as_mults.  This function takes the
 937    array, CACHE, of already calculated exponents and an exponent N and
 938    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 939
 940 static tree
 941 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 942                  HOST_WIDE_INT n, tree *cache, tree target)
 943 {
 944   tree op0, op1, ssa_target;
 945   unsigned HOST_WIDE_INT digit;
 946   gimple mult_stmt;
 947
 948   if (n < POWI_TABLE_SIZE && cache[n])
 949     return cache[n];
 950
 951   ssa_target = make_ssa_name (target, NULL);
 952
 953   if (n < POWI_TABLE_SIZE)
 954     {
 955       cache[n] = ssa_target;
 956       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
 957       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
 958     }
 959   else if (n & 1)
 960     {
 961       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 962       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
 963       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
 964     }
 965   else
 966     {
 967       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
 968       op1 = op0;
 969     }
 970
 971   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 972   gimple_set_location (mult_stmt, loc);
 973   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 974
 975   return ssa_target;
 976 }
 977
 978 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 979    This function needs to be kept in sync with powi_cost above.  */
 980
 981 static tree
 982 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 983                tree arg0, HOST_WIDE_INT n)
 984 {
 985   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
 986   gimple div_stmt;
 987
 988   if (n == 0)
 989     return build_real (type, dconst1);
 990
 991   memset (cache, 0,  sizeof (cache));
 992   cache[1] = arg0;
 993
 994   target = create_tmp_reg (type, "powmult");
 995   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
 996
 997   if (n >= 0)
 998     return result;
 999
1000   /* If the original exponent was negative, reciprocate the result.  */
1001   target = make_ssa_name (target, NULL);
1002   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1003                                            build_real (type, dconst1),
1004                                            result);
1005   gimple_set_location (div_stmt, loc);
1006   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1007
1008   return target;
1009 }
1010
1011 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1012    location info LOC.  If the arguments are appropriate, create an
1013    equivalent sequence of statements prior to GSI using an optimal
1014    number of multiplications, and return an expession holding the
1015    result.  */
1016
1017 static tree
1018 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1019                             tree arg0, HOST_WIDE_INT n)
1020 {
1021   /* Avoid largest negative number.  */
1022   if (n != -n
1023       && ((n >= -1 && n <= 2)
1024           || (optimize_function_for_speed_p (cfun)
1025               && powi_cost (n) <= POWI_MAX_MULTS)))
1026     return powi_as_mults (gsi, loc, arg0, n);
1027
1028   return NULL_TREE;
1029 }
1030
1031 /* Build a gimple call statement that calls FN with argument ARG.
1032    Set the lhs of the call statement to a fresh SSA name for
1033    variable VAR.  If VAR is NULL, first allocate it.  Insert the
1034    statement prior to GSI's current position, and return the fresh
1035    SSA name.  */
1036
1037 static tree
1038 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1039                        tree *var, tree fn, tree arg)
1040 {
1041   gimple call_stmt;
1042   tree ssa_target;
1043
1044   if (!*var)
1045     *var = create_tmp_reg (TREE_TYPE (arg), "powroot");
1046
1047   call_stmt = gimple_build_call (fn, 1, arg);
1048   ssa_target = make_ssa_name (*var, NULL);
1049   gimple_set_lhs (call_stmt, ssa_target);
1050   gimple_set_location (call_stmt, loc);
1051   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1052
1053   return ssa_target;
1054 }
1055
1056 /* Build a gimple binary operation with the given CODE and arguments
1057    ARG0, ARG1, assigning the result to a new SSA name for variable
1058    TARGET.  Insert the statement prior to GSI's current position, and
1059    return the fresh SSA name.*/
1060
1061 static tree
1062 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1063                         tree target, enum tree_code code, tree arg0, tree arg1)
1064 {
1065   tree result = make_ssa_name (target, NULL);
1066   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1067   gimple_set_location (stmt, loc);
1068   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1069   return result;
1070 }
1071
1072 /* Build a gimple reference operation with the given CODE and argument
1073    ARG, assigning the result to a new SSA name for variable TARGET.
1074    Insert the statement prior to GSI's current position, and return
1075    the fresh SSA name.  */
1076
1077 static inline tree
1078 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1079                       tree target, enum tree_code code, tree arg0)
1080 {
1081   tree result = make_ssa_name (target, NULL);
1082   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1083   gimple_set_location (stmt, loc);
1084   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1085   return result;
1086 }
1087
1088 /* Build a gimple assignment to cast VAL to TARGET.  Insert the statement
1089    prior to GSI's current position, and return the fresh SSA name.  */
1090
1091 static tree
1092 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1093                        tree target, tree val)
1094 {
1095   return build_and_insert_binop (gsi, loc, target, CONVERT_EXPR, val, NULL);
1096 }
1097
1098 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1099    with location info LOC.  If possible, create an equivalent and
1100    less expensive sequence of statements prior to GSI, and return an
1101    expession holding the result.  */
1102
1103 static tree
1104 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1105                            tree arg0, tree arg1)
1106 {
1107   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1108   REAL_VALUE_TYPE c2, dconst3;
1109   HOST_WIDE_INT n;
1110   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1111   tree target = NULL_TREE;
1112   enum machine_mode mode;
1113   bool hw_sqrt_exists;
1114
1115   /* If the exponent isn't a constant, there's nothing of interest
1116      to be done.  */
1117   if (TREE_CODE (arg1) != REAL_CST)
1118     return NULL_TREE;
1119
1120   /* If the exponent is equivalent to an integer, expand to an optimal
1121      multiplication sequence when profitable.  */
1122   c = TREE_REAL_CST (arg1);
1123   n = real_to_integer (&c);
1124   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1125
1126   if (real_identical (&c, &cint)
1127       && ((n >= -1 && n <= 2)
1128           || (flag_unsafe_math_optimizations
1129               && optimize_insn_for_speed_p ()
1130               && powi_cost (n) <= POWI_MAX_MULTS)))
1131     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1132
1133   /* Attempt various optimizations using sqrt and cbrt.  */
1134   type = TREE_TYPE (arg0);
1135   mode = TYPE_MODE (type);
1136   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1137
1138   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1139      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1140      sqrt(-0) = -0.  */
1141   if (sqrtfn
1142       && REAL_VALUES_EQUAL (c, dconsthalf)
1143       && !HONOR_SIGNED_ZEROS (mode))
1144     return build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1145
1146   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1147      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1148      so do this optimization even if -Os.  Don't do this optimization
1149      if we don't have a hardware sqrt insn.  */
1150   dconst1_4 = dconst1;
1151   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1152   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1153
1154   if (flag_unsafe_math_optimizations
1155       && sqrtfn
1156       && REAL_VALUES_EQUAL (c, dconst1_4)
1157       && hw_sqrt_exists)
1158     {
1159       /* sqrt(x)  */
1160       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1161
1162       /* sqrt(sqrt(x))  */
1163       return build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1164     }
1165
1166   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1167      optimizing for space.  Don't do this optimization if we don't have
1168      a hardware sqrt insn.  */
1169   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1170   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1171
1172   if (flag_unsafe_math_optimizations
1173       && sqrtfn
1174       && optimize_function_for_speed_p (cfun)
1175       && REAL_VALUES_EQUAL (c, dconst3_4)
1176       && hw_sqrt_exists)
1177     {
1178       /* sqrt(x)  */
1179       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1180
1181       /* sqrt(sqrt(x))  */
1182       sqrt_sqrt = build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1183
1184       /* sqrt(x) * sqrt(sqrt(x))  */
1185       return build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1186                                      sqrt_arg0, sqrt_sqrt);
1187     }
1188
1189   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1190      optimizations since 1./3. is not exactly representable.  If x
1191      is negative and finite, the correct value of pow(x,1./3.) is
1192      a NaN with the "invalid" exception raised, because the value
1193      of 1./3. actually has an even denominator.  The correct value
1194      of cbrt(x) is a negative real value.  */
1195   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1196   dconst1_3 = real_value_truncate (mode, dconst_third ());
1197
1198   if (flag_unsafe_math_optimizations
1199       && cbrtfn
1200       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1201       && REAL_VALUES_EQUAL (c, dconst1_3))
1202     return build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1203
1204   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1205      if we don't have a hardware sqrt insn.  */
1206   dconst1_6 = dconst1_3;
1207   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1208
1209   if (flag_unsafe_math_optimizations
1210       && sqrtfn
1211       && cbrtfn
1212       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1213       && optimize_function_for_speed_p (cfun)
1214       && hw_sqrt_exists
1215       && REAL_VALUES_EQUAL (c, dconst1_6))
1216     {
1217       /* sqrt(x)  */
1218       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1219
1220       /* cbrt(sqrt(x))  */
1221       return build_and_insert_call (gsi, loc, &target, cbrtfn, sqrt_arg0);
1222     }
1223
1224   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1225
1226        sqrt(x) * powi(x, n/2),                n > 0;
1227        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1228
1229      Do not calculate the powi factor when n/2 = 0.  */
1230   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1231   n = real_to_integer (&c2);
1232   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1233
1234   if (flag_unsafe_math_optimizations
1235       && sqrtfn
1236       && real_identical (&c2, &cint))
1237     {
1238       tree powi_x_ndiv2 = NULL_TREE;
1239
1240       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1241          possible or profitable, give up.  Skip the degenerate case when
1242          n is 1 or -1, where the result is always 1.  */
1243       if (absu_hwi (n) != 1)
1244         {
1245           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1246                                                      abs_hwi (n / 2));
1247           if (!powi_x_ndiv2)
1248             return NULL_TREE;
1249         }
1250
1251       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1252          result of the optimal multiply sequence just calculated.  */
1253       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1254
1255       if (absu_hwi (n) == 1)
1256         result = sqrt_arg0;
1257       else
1258         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1259                                          sqrt_arg0, powi_x_ndiv2);
1260
1261       /* If n is negative, reciprocate the result.  */
1262       if (n < 0)
1263         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1264                                          build_real (type, dconst1), result);
1265       return result;
1266     }
1267
1268   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1269
1270      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1271      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1272
1273      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1274      different from pow(x, 1./3.) due to rounding and behavior with
1275      negative x, we need to constrain this transformation to unsafe
1276      math and positive x or finite math.  */
1277   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1278   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1279   real_round (&c2, mode, &c2);
1280   n = real_to_integer (&c2);
1281   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1282   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1283   real_convert (&c2, mode, &c2);
1284
1285   if (flag_unsafe_math_optimizations
1286       && cbrtfn
1287       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1288       && real_identical (&c2, &c)
1289       && optimize_function_for_speed_p (cfun)
1290       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1291     {
1292       tree powi_x_ndiv3 = NULL_TREE;
1293
1294       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1295          possible or profitable, give up.  Skip the degenerate case when
1296          abs(n) < 3, where the result is always 1.  */
1297       if (absu_hwi (n) >= 3)
1298         {
1299           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1300                                                      abs_hwi (n / 3));
1301           if (!powi_x_ndiv3)
1302             return NULL_TREE;
1303         }
1304
1305       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1306          as that creates an unnecessary variable.  Instead, just produce
1307          either cbrt(x) or cbrt(x) * cbrt(x).  */
1308       cbrt_x = build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1309
1310       if (absu_hwi (n) % 3 == 1)
1311         powi_cbrt_x = cbrt_x;
1312       else
1313         powi_cbrt_x = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1314                                               cbrt_x, cbrt_x);
1315
1316       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1317       if (absu_hwi (n) < 3)
1318         result = powi_cbrt_x;
1319       else
1320         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1321                                          powi_x_ndiv3, powi_cbrt_x);
1322
1323       /* If n is negative, reciprocate the result.  */
1324       if (n < 0)
1325         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1326                                          build_real (type, dconst1), result);
1327
1328       return result;
1329     }
1330
1331   /* No optimizations succeeded.  */
1332   return NULL_TREE;
1333 }
1334
1335 /* ARG is the argument to a cabs builtin call in GSI with location info
1336    LOC.  Create a sequence of statements prior to GSI that calculates
1337    sqrt(R*R + I*I), where R and I are the real and imaginary components
1338    of ARG, respectively.  Return an expression holding the result.  */
1339
1340 static tree
1341 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1342 {
1343   tree target, real_part, imag_part, addend1, addend2, sum, result;
1344   tree type = TREE_TYPE (TREE_TYPE (arg));
1345   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1346   enum machine_mode mode = TYPE_MODE (type);
1347
1348   if (!flag_unsafe_math_optimizations
1349       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1350       || !sqrtfn
1351       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1352     return NULL_TREE;
1353
1354   target = create_tmp_reg (type, "cabs");
1355   real_part = build_and_insert_ref (gsi, loc, type, target,
1356                                     REALPART_EXPR, arg);
1357   addend1 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1358                                     real_part, real_part);
1359   imag_part = build_and_insert_ref (gsi, loc, type, target,
1360                                     IMAGPART_EXPR, arg);
1361   addend2 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1362                                     imag_part, imag_part);
1363   sum = build_and_insert_binop (gsi, loc, target, PLUS_EXPR, addend1, addend2);
1364   result = build_and_insert_call (gsi, loc, &target, sqrtfn, sum);
1365
1366   return result;
1367 }
1368
1369 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1370    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1371    an optimal number of multiplies, when n is a constant.  */
1372
1373 static unsigned int
1374 execute_cse_sincos (void)
1375 {
1376   basic_block bb;
1377   bool cfg_changed = false;
1378
1379   calculate_dominance_info (CDI_DOMINATORS);
1380   memset (&sincos_stats, 0, sizeof (sincos_stats));
1381
1382   FOR_EACH_BB (bb)
1383     {
1384       gimple_stmt_iterator gsi;
1385
1386       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1387         {
1388           gimple stmt = gsi_stmt (gsi);
1389           tree fndecl;
1390
1391           if (is_gimple_call (stmt)
1392               && gimple_call_lhs (stmt)
1393               && (fndecl = gimple_call_fndecl (stmt))
1394               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1395             {
1396               tree arg, arg0, arg1, result;
1397               HOST_WIDE_INT n;
1398               location_t loc;
1399
1400               switch (DECL_FUNCTION_CODE (fndecl))
1401                 {
1402                 CASE_FLT_FN (BUILT_IN_COS):
1403                 CASE_FLT_FN (BUILT_IN_SIN):
1404                 CASE_FLT_FN (BUILT_IN_CEXPI):
1405                   /* Make sure we have either sincos or cexp.  */
1406                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1407                     break;
1408
1409                   arg = gimple_call_arg (stmt, 0);
1410                   if (TREE_CODE (arg) == SSA_NAME)
1411                     cfg_changed |= execute_cse_sincos_1 (arg);
1412                   break;
1413
1414                 CASE_FLT_FN (BUILT_IN_POW):
1415                   arg0 = gimple_call_arg (stmt, 0);
1416                   arg1 = gimple_call_arg (stmt, 1);
1417
1418                   loc = gimple_location (stmt);
1419                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1420
1421                   if (result)
1422                     {
1423                       tree lhs = gimple_get_lhs (stmt);
1424                       gimple new_stmt = gimple_build_assign (lhs, result);
1425                       gimple_set_location (new_stmt, loc);
1426                       unlink_stmt_vdef (stmt);
1427                       gsi_replace (&gsi, new_stmt, true);
1428                       if (gimple_vdef (stmt))
1429                         release_ssa_name (gimple_vdef (stmt));
1430                     }
1431                   break;
1432
1433                 CASE_FLT_FN (BUILT_IN_POWI):
1434                   arg0 = gimple_call_arg (stmt, 0);
1435                   arg1 = gimple_call_arg (stmt, 1);
1436                   if (!host_integerp (arg1, 0))
1437                     break;
1438
1439                   n = TREE_INT_CST_LOW (arg1);
1440                   loc = gimple_location (stmt);
1441                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1442
1443                   if (result)
1444                     {
1445                       tree lhs = gimple_get_lhs (stmt);
1446                       gimple new_stmt = gimple_build_assign (lhs, result);
1447                       gimple_set_location (new_stmt, loc);
1448                       unlink_stmt_vdef (stmt);
1449                       gsi_replace (&gsi, new_stmt, true);
1450                       if (gimple_vdef (stmt))
1451                         release_ssa_name (gimple_vdef (stmt));
1452                     }
1453                   break;
1454
1455                 CASE_FLT_FN (BUILT_IN_CABS):
1456                   arg0 = gimple_call_arg (stmt, 0);
1457                   loc = gimple_location (stmt);
1458                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1459
1460                   if (result)
1461                     {
1462                       tree lhs = gimple_get_lhs (stmt);
1463                       gimple new_stmt = gimple_build_assign (lhs, result);
1464                       gimple_set_location (new_stmt, loc);
1465                       unlink_stmt_vdef (stmt);
1466                       gsi_replace (&gsi, new_stmt, true);
1467                       if (gimple_vdef (stmt))
1468                         release_ssa_name (gimple_vdef (stmt));
1469                     }
1470                   break;
1471
1472                 default:;
1473                 }
1474             }
1475         }
1476     }
1477
1478   statistics_counter_event (cfun, "sincos statements inserted",
1479                             sincos_stats.inserted);
1480
1481   free_dominance_info (CDI_DOMINATORS);
1482   return cfg_changed ? TODO_cleanup_cfg : 0;
1483 }
1484
1485 static bool
1486 gate_cse_sincos (void)
1487 {
1488   /* We no longer require either sincos or cexp, since powi expansion
1489      piggybacks on this pass.  */
1490   return optimize;
1491 }
1492
/* Descriptor of the "sincos" GIMPLE pass: gated by gate_cse_sincos
   and executed by execute_cse_sincos.  It requires SSA form and asks
   for an SSA update plus SSA and statement verification when it
   finishes.  */

struct gimple_opt_pass pass_cse_sincos =
{
 {
  GIMPLE_PASS,
  "sincos",                             /* name */
  gate_cse_sincos,                      /* gate */
  execute_cse_sincos,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_NONE,                              /* tv_id */
  PROP_ssa,                             /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_update_ssa | TODO_verify_ssa
    | TODO_verify_stmts                 /* todo_flags_finish */
 }
};
1512
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of byte size markers, one marker per byte of the value
   being tracked:

   0       - byte has the value 0
   1..size - byte contains the content of the source byte whose
             number is that value minus one  */

struct symbolic_number {
  /* The markers, lowest-order byte in the least significant
     position.  */
  unsigned HOST_WIDEST_INT n;
  /* Size of the tracked value, in bytes.  */
  int size;
};
1525
1526 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1527    number N.  Return false if the requested operation is not permitted
1528    on a symbolic number.  */
1529
1530 static inline bool
1531 do_shift_rotate (enum tree_code code,
1532                  struct symbolic_number *n,
1533                  int count)
1534 {
1535   if (count % 8 != 0)
1536     return false;
1537
1538   /* Zero out the extra bits of N in order to avoid them being shifted
1539      into the significant bits.  */
1540   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1541     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1542
1543   switch (code)
1544     {
1545     case LSHIFT_EXPR:
1546       n->n <<= count;
1547       break;
1548     case RSHIFT_EXPR:
1549       n->n >>= count;
1550       break;
1551     case LROTATE_EXPR:
1552       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1553       break;
1554     case RROTATE_EXPR:
1555       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1556       break;
1557     default:
1558       return false;
1559     }
1560   /* Zero unused bits for size.  */
1561   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1562     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1563   return true;
1564 }
1565
1566 /* Perform sanity checking for the symbolic number N and the gimple
1567    statement STMT.  */
1568
1569 static inline bool
1570 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1571 {
1572   tree lhs_type;
1573
1574   lhs_type = gimple_expr_type (stmt);
1575
1576   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1577     return false;
1578
1579   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1580     return false;
1581
1582   return true;
1583 }
1584
1585 /* find_bswap_1 invokes itself recursively with N and tries to perform
1586    the operation given by the rhs of STMT on the result.  If the
1587    operation could successfully be executed the function returns the
1588    tree expression of the source operand and NULL otherwise.  */
1589
1590 static tree
1591 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1592 {
1593   enum tree_code code;
1594   tree rhs1, rhs2 = NULL;
1595   gimple rhs1_stmt, rhs2_stmt;
1596   tree source_expr1;
1597   enum gimple_rhs_class rhs_class;
1598
1599   if (!limit || !is_gimple_assign (stmt))
1600     return NULL_TREE;
1601
1602   rhs1 = gimple_assign_rhs1 (stmt);
1603
1604   if (TREE_CODE (rhs1) != SSA_NAME)
1605     return NULL_TREE;
1606
1607   code = gimple_assign_rhs_code (stmt);
1608   rhs_class = gimple_assign_rhs_class (stmt);
1609   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1610
1611   if (rhs_class == GIMPLE_BINARY_RHS)
1612     rhs2 = gimple_assign_rhs2 (stmt);
1613
1614   /* Handle unary rhs and binary rhs with integer constants as second
1615      operand.  */
1616
1617   if (rhs_class == GIMPLE_UNARY_RHS
1618       || (rhs_class == GIMPLE_BINARY_RHS
1619           && TREE_CODE (rhs2) == INTEGER_CST))
1620     {
1621       if (code != BIT_AND_EXPR
1622           && code != LSHIFT_EXPR
1623           && code != RSHIFT_EXPR
1624           && code != LROTATE_EXPR
1625           && code != RROTATE_EXPR
1626           && code != NOP_EXPR
1627           && code != CONVERT_EXPR)
1628         return NULL_TREE;
1629
1630       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1631
1632       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1633          to initialize the symbolic number.  */
1634       if (!source_expr1)
1635         {
1636           /* Set up the symbolic number N by setting each byte to a
1637              value between 1 and the byte size of rhs1.  The highest
1638              order byte is set to n->size and the lowest order
1639              byte to 1.  */
1640           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1641           if (n->size % BITS_PER_UNIT != 0)
1642             return NULL_TREE;
1643           n->size /= BITS_PER_UNIT;
1644           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1645                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1646
1647           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1648             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1649                      (n->size * BITS_PER_UNIT)) - 1;
1650
1651           source_expr1 = rhs1;
1652         }
1653
1654       switch (code)
1655         {
1656         case BIT_AND_EXPR:
1657           {
1658             int i;
1659             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1660             unsigned HOST_WIDEST_INT tmp = val;
1661
1662             /* Only constants masking full bytes are allowed.  */
1663             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1664               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1665                 return NULL_TREE;
1666
1667             n->n &= val;
1668           }
1669           break;
1670         case LSHIFT_EXPR:
1671         case RSHIFT_EXPR:
1672         case LROTATE_EXPR:
1673         case RROTATE_EXPR:
1674           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1675             return NULL_TREE;
1676           break;
1677         CASE_CONVERT:
1678           {
1679             int type_size;
1680
1681             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1682             if (type_size % BITS_PER_UNIT != 0)
1683               return NULL_TREE;
1684
1685             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1686               {
1687                 /* If STMT casts to a smaller type mask out the bits not
1688                    belonging to the target type.  */
1689                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1690               }
1691             n->size = type_size / BITS_PER_UNIT;
1692           }
1693           break;
1694         default:
1695           return NULL_TREE;
1696         };
1697       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1698     }
1699
1700   /* Handle binary rhs.  */
1701
1702   if (rhs_class == GIMPLE_BINARY_RHS)
1703     {
1704       struct symbolic_number n1, n2;
1705       tree source_expr2;
1706
1707       if (code != BIT_IOR_EXPR)
1708         return NULL_TREE;
1709
1710       if (TREE_CODE (rhs2) != SSA_NAME)
1711         return NULL_TREE;
1712
1713       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1714
1715       switch (code)
1716         {
1717         case BIT_IOR_EXPR:
1718           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1719
1720           if (!source_expr1)
1721             return NULL_TREE;
1722
1723           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1724
1725           if (source_expr1 != source_expr2
1726               || n1.size != n2.size)
1727             return NULL_TREE;
1728
1729           n->size = n1.size;
1730           n->n = n1.n | n2.n;
1731
1732           if (!verify_symbolic_number_p (n, stmt))
1733             return NULL_TREE;
1734
1735           break;
1736         default:
1737           return NULL_TREE;
1738         }
1739       return source_expr1;
1740     }
1741   return NULL_TREE;
1742 }
1743
1744 /* Check if STMT completes a bswap implementation consisting of ORs,
1745    SHIFTs and ANDs.  Return the source tree expression on which the
1746    byte swap is performed and NULL if no bswap was found.  */
1747
1748 static tree
1749 find_bswap (gimple stmt)
1750 {
1751 /* The number which the find_bswap result should match in order to
1752    have a full byte swap.  The number is shifted to the left according
1753    to the size of the symbolic number before using it.  */
1754   unsigned HOST_WIDEST_INT cmp =
1755     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1756     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1757
1758   struct symbolic_number n;
1759   tree source_expr;
1760   int limit;
1761
1762   /* The last parameter determines the depth search limit.  It usually
1763      correlates directly to the number of bytes to be touched.  We
1764      increase that number by three  here in order to also
1765      cover signed -> unsigned converions of the src operand as can be seen
1766      in libgcc, and for initial shift/and operation of the src operand.  */
1767   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1768   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1769   source_expr =  find_bswap_1 (stmt, &n, limit);
1770
1771   if (!source_expr)
1772     return NULL_TREE;
1773
1774   /* Zero out the extra bits of N and CMP.  */
1775   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1776     {
1777       unsigned HOST_WIDEST_INT mask =
1778         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1779
1780       n.n &= mask;
1781       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1782     }
1783
1784   /* A complete byte swap should make the symbolic number to start
1785      with the largest digit in the highest order byte.  */
1786   if (cmp != n.n)
1787     return NULL_TREE;
1788
1789   return source_expr;
1790 }
1791
1792 /* Find manual byte swap implementations and turn them into a bswap
1793    builtin invokation.  */
1794
1795 static unsigned int
1796 execute_optimize_bswap (void)
1797 {
1798   basic_block bb;
1799   bool bswap32_p, bswap64_p;
1800   bool changed = false;
1801   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1802
1803   if (BITS_PER_UNIT != 8)
1804     return 0;
1805
1806   if (sizeof (HOST_WIDEST_INT) < 8)
1807     return 0;
1808
1809   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1810                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1811   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1812                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1813                    || (bswap32_p && word_mode == SImode)));
1814
1815   if (!bswap32_p && !bswap64_p)
1816     return 0;
1817
1818   /* Determine the argument type of the builtins.  The code later on
1819      assumes that the return and argument type are the same.  */
1820   if (bswap32_p)
1821     {
1822       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1823       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1824     }
1825
1826   if (bswap64_p)
1827     {
1828       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1829       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1830     }
1831
1832   memset (&bswap_stats, 0, sizeof (bswap_stats));
1833
1834   FOR_EACH_BB (bb)
1835     {
1836       gimple_stmt_iterator gsi;
1837
1838       /* We do a reverse scan for bswap patterns to make sure we get the
1839          widest match. As bswap pattern matching doesn't handle
1840          previously inserted smaller bswap replacements as sub-
1841          patterns, the wider variant wouldn't be detected.  */
1842       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1843         {
1844           gimple stmt = gsi_stmt (gsi);
1845           tree bswap_src, bswap_type;
1846           tree bswap_tmp;
1847           tree fndecl = NULL_TREE;
1848           int type_size;
1849           gimple call;
1850
1851           if (!is_gimple_assign (stmt)
1852               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1853             continue;
1854
1855           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1856
1857           switch (type_size)
1858             {
1859             case 32:
1860               if (bswap32_p)
1861                 {
1862                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1863                   bswap_type = bswap32_type;
1864                 }
1865               break;
1866             case 64:
1867               if (bswap64_p)
1868                 {
1869                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1870                   bswap_type = bswap64_type;
1871                 }
1872               break;
1873             default:
1874               continue;
1875             }
1876
1877           if (!fndecl)
1878             continue;
1879
1880           bswap_src = find_bswap (stmt);
1881
1882           if (!bswap_src)
1883             continue;
1884
1885           changed = true;
1886           if (type_size == 32)
1887             bswap_stats.found_32bit++;
1888           else
1889             bswap_stats.found_64bit++;
1890
1891           bswap_tmp = bswap_src;
1892
1893           /* Convert the src expression if necessary.  */
1894           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1895             {
1896               gimple convert_stmt;
1897
1898               bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1899               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1900
1901               convert_stmt = gimple_build_assign_with_ops (
1902                                CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1903               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1904             }
1905
1906           call = gimple_build_call (fndecl, 1, bswap_tmp);
1907
1908           bswap_tmp = gimple_assign_lhs (stmt);
1909
1910           /* Convert the result if necessary.  */
1911           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1912             {
1913               gimple convert_stmt;
1914
1915               bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1916               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1917               convert_stmt = gimple_build_assign_with_ops (
1918                                CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1919               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1920             }
1921
1922           gimple_call_set_lhs (call, bswap_tmp);
1923
1924           if (dump_file)
1925             {
1926               fprintf (dump_file, "%d bit bswap implementation found at: ",
1927                        (int)type_size);
1928               print_gimple_stmt (dump_file, stmt, 0, 0);
1929             }
1930
1931           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1932           gsi_remove (&gsi, true);
1933         }
1934     }
1935
1936   statistics_counter_event (cfun, "32-bit bswap implementations found",
1937                             bswap_stats.found_32bit);
1938   statistics_counter_event (cfun, "64-bit bswap implementations found",
1939                             bswap_stats.found_64bit);
1940
1941   return (changed ? TODO_update_ssa | TODO_verify_ssa
1942           | TODO_verify_stmts : 0);
1943 }
1944
1945 static bool
1946 gate_optimize_bswap (void)
1947 {
1948   return flag_expensive_optimizations && optimize;
1949 }
1950
1951 struct gimple_opt_pass pass_optimize_bswap =
1952 {
1953  {
1954   GIMPLE_PASS,
1955   "bswap",                              /* name */
1956   gate_optimize_bswap,                  /* gate */
1957   execute_optimize_bswap,               /* execute */
1958   NULL,                                 /* sub */
1959   NULL,                                 /* next */
1960   0,                                    /* static_pass_number */
1961   TV_NONE,                              /* tv_id */
1962   PROP_ssa,                             /* properties_required */
1963   0,                                    /* properties_provided */
1964   0,                                    /* properties_destroyed */
1965   0,                                    /* todo_flags_start */
1966   0                                     /* todo_flags_finish */
1967  }
1968 };
1969
1970 /* Return true if RHS is a suitable operand for a widening multiplication,
1971    assuming a target type of TYPE.
1972    There are two cases:
1973
1974      - RHS makes some value at least twice as wide.  Store that value
1975        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
1976
1977      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
1978        but leave *TYPE_OUT untouched.  */
1979
1980 static bool
1981 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
1982                         tree *new_rhs_out)
1983 {
1984   gimple stmt;
1985   tree type1, rhs1;
1986   enum tree_code rhs_code;
1987
1988   if (TREE_CODE (rhs) == SSA_NAME)
1989     {
1990       stmt = SSA_NAME_DEF_STMT (rhs);
1991       if (is_gimple_assign (stmt))
1992         {
1993           rhs_code = gimple_assign_rhs_code (stmt);
1994           if (TREE_CODE (type) == INTEGER_TYPE
1995               ? !CONVERT_EXPR_CODE_P (rhs_code)
1996               : rhs_code != FIXED_CONVERT_EXPR)
1997             rhs1 = rhs;
1998           else
1999             {
2000               rhs1 = gimple_assign_rhs1 (stmt);
2001
2002               if (TREE_CODE (rhs1) == INTEGER_CST)
2003                 {
2004                   *new_rhs_out = rhs1;
2005                   *type_out = NULL;
2006                   return true;
2007                 }
2008             }
2009         }
2010       else
2011         rhs1 = rhs;
2012
2013       type1 = TREE_TYPE (rhs1);
2014
2015       if (TREE_CODE (type1) != TREE_CODE (type)
2016           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2017         return false;
2018
2019       *new_rhs_out = rhs1;
2020       *type_out = type1;
2021       return true;
2022     }
2023
2024   if (TREE_CODE (rhs) == INTEGER_CST)
2025     {
2026       *new_rhs_out = rhs;
2027       *type_out = NULL;
2028       return true;
2029     }
2030
2031   return false;
2032 }
2033
2034 /* Return true if STMT performs a widening multiplication, assuming the
2035    output type is TYPE.  If so, store the unwidened types of the operands
2036    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2037    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2038    and *TYPE2_OUT would give the operands of the multiplication.  */
2039
2040 static bool
2041 is_widening_mult_p (gimple stmt,
2042                     tree *type1_out, tree *rhs1_out,
2043                     tree *type2_out, tree *rhs2_out)
2044 {
2045   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2046
2047   if (TREE_CODE (type) != INTEGER_TYPE
2048       && TREE_CODE (type) != FIXED_POINT_TYPE)
2049     return false;
2050
2051   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2052                                rhs1_out))
2053     return false;
2054
2055   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2056                                rhs2_out))
2057     return false;
2058
2059   if (*type1_out == NULL)
2060     {
2061       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2062         return false;
2063       *type1_out = *type2_out;
2064     }
2065
2066   if (*type2_out == NULL)
2067     {
2068       if (!int_fits_type_p (*rhs2_out, *type1_out))
2069         return false;
2070       *type2_out = *type1_out;
2071     }
2072
2073   /* Ensure that the larger of the two operands comes first. */
2074   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2075     {
2076       tree tmp;
2077       tmp = *type1_out;
2078       *type1_out = *type2_out;
2079       *type2_out = tmp;
2080       tmp = *rhs1_out;
2081       *rhs1_out = *rhs2_out;
2082       *rhs2_out = tmp;
2083     }
2084
2085   return true;
2086 }
2087
2088 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2089    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2090    value is true iff we converted the statement.  */
2091
2092 static bool
2093 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2094 {
2095   tree lhs, rhs1, rhs2, type, type1, type2, tmp = NULL;
2096   enum insn_code handler;
2097   enum machine_mode to_mode, from_mode, actual_mode;
2098   optab op;
2099   int actual_precision;
2100   location_t loc = gimple_location (stmt);
2101   bool from_unsigned1, from_unsigned2;
2102
2103   lhs = gimple_assign_lhs (stmt);
2104   type = TREE_TYPE (lhs);
2105   if (TREE_CODE (type) != INTEGER_TYPE)
2106     return false;
2107
2108   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2109     return false;
2110
2111   to_mode = TYPE_MODE (type);
2112   from_mode = TYPE_MODE (type1);
2113   from_unsigned1 = TYPE_UNSIGNED (type1);
2114   from_unsigned2 = TYPE_UNSIGNED (type2);
2115
2116   if (from_unsigned1 && from_unsigned2)
2117     op = umul_widen_optab;
2118   else if (!from_unsigned1 && !from_unsigned2)
2119     op = smul_widen_optab;
2120   else
2121     op = usmul_widen_optab;
2122
2123   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2124                                                   0, &actual_mode);
2125
2126   if (handler == CODE_FOR_nothing)
2127     {
2128       if (op != smul_widen_optab)
2129         {
2130           /* We can use a signed multiply with unsigned types as long as
2131              there is a wider mode to use, or it is the smaller of the two
2132              types that is unsigned.  Note that type1 >= type2, always.  */
2133           if ((TYPE_UNSIGNED (type1)
2134                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2135               || (TYPE_UNSIGNED (type2)
2136                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2137             {
2138               from_mode = GET_MODE_WIDER_MODE (from_mode);
2139               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2140                 return false;
2141             }
2142
2143           op = smul_widen_optab;
2144           handler = find_widening_optab_handler_and_mode (op, to_mode,
2145                                                           from_mode, 0,
2146                                                           &actual_mode);
2147
2148           if (handler == CODE_FOR_nothing)
2149             return false;
2150
2151           from_unsigned1 = from_unsigned2 = false;
2152         }
2153       else
2154         return false;
2155     }
2156
2157   /* Ensure that the inputs to the handler are in the correct precison
2158      for the opcode.  This will be the full mode size.  */
2159   actual_precision = GET_MODE_PRECISION (actual_mode);
2160   if (2 * actual_precision > TYPE_PRECISION (type))
2161     return false;
2162   if (actual_precision != TYPE_PRECISION (type1)
2163       || from_unsigned1 != TYPE_UNSIGNED (type1))
2164     {
2165       tmp = create_tmp_var (build_nonstandard_integer_type
2166                                 (actual_precision, from_unsigned1),
2167                             NULL);
2168       rhs1 = build_and_insert_cast (gsi, loc, tmp, rhs1);
2169     }
2170   if (actual_precision != TYPE_PRECISION (type2)
2171       || from_unsigned2 != TYPE_UNSIGNED (type2))
2172     {
2173       /* Reuse the same type info, if possible.  */
2174       if (!tmp || from_unsigned1 != from_unsigned2)
2175         tmp = create_tmp_var (build_nonstandard_integer_type
2176                                 (actual_precision, from_unsigned2),
2177                               NULL);
2178       rhs2 = build_and_insert_cast (gsi, loc, tmp, rhs2);
2179     }
2180
2181   /* Handle constants.  */
2182   if (TREE_CODE (rhs1) == INTEGER_CST)
2183     rhs1 = fold_convert (type1, rhs1);
2184   if (TREE_CODE (rhs2) == INTEGER_CST)
2185     rhs2 = fold_convert (type2, rhs2);
2186
2187   gimple_assign_set_rhs1 (stmt, rhs1);
2188   gimple_assign_set_rhs2 (stmt, rhs2);
2189   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2190   update_stmt (stmt);
2191   widen_mul_stats.widen_mults_inserted++;
2192   return true;
2193 }
2194
2195 /* Process a single gimple statement STMT, which is found at the
2196    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2197    rhs (given by CODE), and try to convert it into a
2198    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2199    is true iff we converted the statement.  */
2200
2201 static bool
2202 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2203                             enum tree_code code)
2204 {
2205   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2206   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2207   tree type, type1, type2, optype, tmp = NULL;
2208   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2209   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2210   optab this_optab;
2211   enum tree_code wmult_code;
2212   enum insn_code handler;
2213   enum machine_mode to_mode, from_mode, actual_mode;
2214   location_t loc = gimple_location (stmt);
2215   int actual_precision;
2216   bool from_unsigned1, from_unsigned2;
2217
2218   lhs = gimple_assign_lhs (stmt);
2219   type = TREE_TYPE (lhs);
2220   if (TREE_CODE (type) != INTEGER_TYPE
2221       && TREE_CODE (type) != FIXED_POINT_TYPE)
2222     return false;
2223
2224   if (code == MINUS_EXPR)
2225     wmult_code = WIDEN_MULT_MINUS_EXPR;
2226   else
2227     wmult_code = WIDEN_MULT_PLUS_EXPR;
2228
2229   rhs1 = gimple_assign_rhs1 (stmt);
2230   rhs2 = gimple_assign_rhs2 (stmt);
2231
2232   if (TREE_CODE (rhs1) == SSA_NAME)
2233     {
2234       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2235       if (is_gimple_assign (rhs1_stmt))
2236         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2237     }
2238
2239   if (TREE_CODE (rhs2) == SSA_NAME)
2240     {
2241       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2242       if (is_gimple_assign (rhs2_stmt))
2243         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2244     }
2245
2246   /* Allow for one conversion statement between the multiply
2247      and addition/subtraction statement.  If there are more than
2248      one conversions then we assume they would invalidate this
2249      transformation.  If that's not the case then they should have
2250      been folded before now.  */
2251   if (CONVERT_EXPR_CODE_P (rhs1_code))
2252     {
2253       conv1_stmt = rhs1_stmt;
2254       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2255       if (TREE_CODE (rhs1) == SSA_NAME)
2256         {
2257           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2258           if (is_gimple_assign (rhs1_stmt))
2259             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2260         }
2261       else
2262         return false;
2263     }
2264   if (CONVERT_EXPR_CODE_P (rhs2_code))
2265     {
2266       conv2_stmt = rhs2_stmt;
2267       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2268       if (TREE_CODE (rhs2) == SSA_NAME)
2269         {
2270           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2271           if (is_gimple_assign (rhs2_stmt))
2272             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2273         }
2274       else
2275         return false;
2276     }
2277
2278   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2279      is_widening_mult_p, but we still need the rhs returns.
2280
2281      It might also appear that it would be sufficient to use the existing
2282      operands of the widening multiply, but that would limit the choice of
2283      multiply-and-accumulate instructions.  */
2284   if (code == PLUS_EXPR
2285       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2286     {
2287       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2288                                &type2, &mult_rhs2))
2289         return false;
2290       add_rhs = rhs2;
2291       conv_stmt = conv1_stmt;
2292     }
2293   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2294     {
2295       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2296                                &type2, &mult_rhs2))
2297         return false;
2298       add_rhs = rhs1;
2299       conv_stmt = conv2_stmt;
2300     }
2301   else
2302     return false;
2303
2304   to_mode = TYPE_MODE (type);
2305   from_mode = TYPE_MODE (type1);
2306   from_unsigned1 = TYPE_UNSIGNED (type1);
2307   from_unsigned2 = TYPE_UNSIGNED (type2);
2308   optype = type1;
2309
2310   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2311   if (from_unsigned1 != from_unsigned2)
2312     {
2313       if (!INTEGRAL_TYPE_P (type))
2314         return false;
2315       /* We can use a signed multiply with unsigned types as long as
2316          there is a wider mode to use, or it is the smaller of the two
2317          types that is unsigned.  Note that type1 >= type2, always.  */
2318       if ((from_unsigned1
2319            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2320           || (from_unsigned2
2321               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2322         {
2323           from_mode = GET_MODE_WIDER_MODE (from_mode);
2324           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2325             return false;
2326         }
2327
2328       from_unsigned1 = from_unsigned2 = false;
2329       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2330                                                false);
2331     }
2332
2333   /* If there was a conversion between the multiply and addition
2334      then we need to make sure it fits a multiply-and-accumulate.
2335      The should be a single mode change which does not change the
2336      value.  */
2337   if (conv_stmt)
2338     {
2339       /* We use the original, unmodified data types for this.  */
2340       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2341       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2342       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2343       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2344
2345       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2346         {
2347           /* Conversion is a truncate.  */
2348           if (TYPE_PRECISION (to_type) < data_size)
2349             return false;
2350         }
2351       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2352         {
2353           /* Conversion is an extend.  Check it's the right sort.  */
2354           if (TYPE_UNSIGNED (from_type) != is_unsigned
2355               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2356             return false;
2357         }
2358       /* else convert is a no-op for our purposes.  */
2359     }
2360
2361   /* Verify that the machine can perform a widening multiply
2362      accumulate in this mode/signedness combination, otherwise
2363      this transformation is likely to pessimize code.  */
2364   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2365   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2366                                                   from_mode, 0, &actual_mode);
2367
2368   if (handler == CODE_FOR_nothing)
2369     return false;
2370
2371   /* Ensure that the inputs to the handler are in the correct precison
2372      for the opcode.  This will be the full mode size.  */
2373   actual_precision = GET_MODE_PRECISION (actual_mode);
2374   if (actual_precision != TYPE_PRECISION (type1)
2375       || from_unsigned1 != TYPE_UNSIGNED (type1))
2376     {
2377       tmp = create_tmp_var (build_nonstandard_integer_type
2378                                 (actual_precision, from_unsigned1),
2379                             NULL);
2380       mult_rhs1 = build_and_insert_cast (gsi, loc, tmp, mult_rhs1);
2381     }
2382   if (actual_precision != TYPE_PRECISION (type2)
2383       || from_unsigned2 != TYPE_UNSIGNED (type2))
2384     {
2385       if (!tmp || from_unsigned1 != from_unsigned2)
2386         tmp = create_tmp_var (build_nonstandard_integer_type
2387                                 (actual_precision, from_unsigned2),
2388                               NULL);
2389       mult_rhs2 = build_and_insert_cast (gsi, loc, tmp, mult_rhs2);
2390     }
2391
2392   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2393     add_rhs = build_and_insert_cast (gsi, loc, create_tmp_var (type, NULL),
2394                                      add_rhs);
2395
2396   /* Handle constants.  */
2397   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2398     mult_rhs1 = fold_convert (type1, mult_rhs1);
2399   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2400     mult_rhs2 = fold_convert (type2, mult_rhs2);
2401
2402   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2403                                     add_rhs);
2404   update_stmt (gsi_stmt (*gsi));
2405   widen_mul_stats.maccs_inserted++;
2406   return true;
2407 }
2408
2409 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2410    with uses in additions and subtractions to form fused multiply-add
2411    operations.  Returns true if successful and MUL_STMT should be removed.  */
2412
2413 static bool
2414 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2415 {
2416   tree mul_result = gimple_get_lhs (mul_stmt);
2417   tree type = TREE_TYPE (mul_result);
2418   gimple use_stmt, neguse_stmt, fma_stmt;
2419   use_operand_p use_p;
2420   imm_use_iterator imm_iter;
2421
2422   if (FLOAT_TYPE_P (type)
2423       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2424     return false;
2425
2426   /* We don't want to do bitfield reduction ops.  */
2427   if (INTEGRAL_TYPE_P (type)
2428       && (TYPE_PRECISION (type)
2429           != GET_MODE_PRECISION (TYPE_MODE (type))))
2430     return false;
2431
2432   /* If the target doesn't support it, don't generate it.  We assume that
2433      if fma isn't available then fms, fnma or fnms are not either.  */
2434   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2435     return false;
2436
2437   /* If the multiplication has zero uses, it is kept around probably because
2438      of -fnon-call-exceptions.  Don't optimize it away in that case,
2439      it is DCE job.  */
2440   if (has_zero_uses (mul_result))
2441     return false;
2442
2443   /* Make sure that the multiplication statement becomes dead after
2444      the transformation, thus that all uses are transformed to FMAs.
2445      This means we assume that an FMA operation has the same cost
2446      as an addition.  */
2447   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2448     {
2449       enum tree_code use_code;
2450       tree result = mul_result;
2451       bool negate_p = false;
2452
2453       use_stmt = USE_STMT (use_p);
2454
2455       if (is_gimple_debug (use_stmt))
2456         continue;
2457
2458       /* For now restrict this operations to single basic blocks.  In theory
2459          we would want to support sinking the multiplication in
2460          m = a*b;
2461          if ()
2462            ma = m + c;
2463          else
2464            d = m;
2465          to form a fma in the then block and sink the multiplication to the
2466          else block.  */
2467       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2468         return false;
2469
2470       if (!is_gimple_assign (use_stmt))
2471         return false;
2472
2473       use_code = gimple_assign_rhs_code (use_stmt);
2474
2475       /* A negate on the multiplication leads to FNMA.  */
2476       if (use_code == NEGATE_EXPR)
2477         {
2478           ssa_op_iter iter;
2479           use_operand_p usep;
2480
2481           result = gimple_assign_lhs (use_stmt);
2482
2483           /* Make sure the negate statement becomes dead with this
2484              single transformation.  */
2485           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2486                                &use_p, &neguse_stmt))
2487             return false;
2488
2489           /* Make sure the multiplication isn't also used on that stmt.  */
2490           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2491             if (USE_FROM_PTR (usep) == mul_result)
2492               return false;
2493
2494           /* Re-validate.  */
2495           use_stmt = neguse_stmt;
2496           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2497             return false;
2498           if (!is_gimple_assign (use_stmt))
2499             return false;
2500
2501           use_code = gimple_assign_rhs_code (use_stmt);
2502           negate_p = true;
2503         }
2504
2505       switch (use_code)
2506         {
2507         case MINUS_EXPR:
2508           if (gimple_assign_rhs2 (use_stmt) == result)
2509             negate_p = !negate_p;
2510           break;
2511         case PLUS_EXPR:
2512           break;
2513         default:
2514           /* FMA can only be formed from PLUS and MINUS.  */
2515           return false;
2516         }
2517
2518       /* We can't handle a * b + a * b.  */
2519       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2520         return false;
2521
2522       /* While it is possible to validate whether or not the exact form
2523          that we've recognized is available in the backend, the assumption
2524          is that the transformation is never a loss.  For instance, suppose
2525          the target only has the plain FMA pattern available.  Consider
2526          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2527          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2528          still have 3 operations, but in the FMA form the two NEGs are
2529          independent and could be run in parallel.  */
2530     }
2531
2532   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2533     {
2534       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2535       enum tree_code use_code;
2536       tree addop, mulop1 = op1, result = mul_result;
2537       bool negate_p = false;
2538
2539       if (is_gimple_debug (use_stmt))
2540         continue;
2541
2542       use_code = gimple_assign_rhs_code (use_stmt);
2543       if (use_code == NEGATE_EXPR)
2544         {
2545           result = gimple_assign_lhs (use_stmt);
2546           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2547           gsi_remove (&gsi, true);
2548           release_defs (use_stmt);
2549
2550           use_stmt = neguse_stmt;
2551           gsi = gsi_for_stmt (use_stmt);
2552           use_code = gimple_assign_rhs_code (use_stmt);
2553           negate_p = true;
2554         }
2555
2556       if (gimple_assign_rhs1 (use_stmt) == result)
2557         {
2558           addop = gimple_assign_rhs2 (use_stmt);
2559           /* a * b - c -> a * b + (-c)  */
2560           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2561             addop = force_gimple_operand_gsi (&gsi,
2562                                               build1 (NEGATE_EXPR,
2563                                                       type, addop),
2564                                               true, NULL_TREE, true,
2565                                               GSI_SAME_STMT);
2566         }
2567       else
2568         {
2569           addop = gimple_assign_rhs1 (use_stmt);
2570           /* a - b * c -> (-b) * c + a */
2571           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2572             negate_p = !negate_p;
2573         }
2574
2575       if (negate_p)
2576         mulop1 = force_gimple_operand_gsi (&gsi,
2577                                            build1 (NEGATE_EXPR,
2578                                                    type, mulop1),
2579                                            true, NULL_TREE, true,
2580                                            GSI_SAME_STMT);
2581
2582       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2583                                                 gimple_assign_lhs (use_stmt),
2584                                                 mulop1, op2,
2585                                                 addop);
2586       gsi_replace (&gsi, fma_stmt, true);
2587       widen_mul_stats.fmas_inserted++;
2588     }
2589
2590   return true;
2591 }
2592
2593 /* Find integer multiplications where the operands are extended from
2594    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2595    where appropriate.  */
2596
2597 static unsigned int
2598 execute_optimize_widening_mul (void)
2599 {
2600   basic_block bb;
2601   bool cfg_changed = false;
2602
2603   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2604
2605   FOR_EACH_BB (bb)
2606     {
2607       gimple_stmt_iterator gsi;
2608
2609       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2610         {
2611           gimple stmt = gsi_stmt (gsi);
2612           enum tree_code code;
2613
2614           if (is_gimple_assign (stmt))
2615             {
2616               code = gimple_assign_rhs_code (stmt);
2617               switch (code)
2618                 {
2619                 case MULT_EXPR:
2620                   if (!convert_mult_to_widen (stmt, &gsi)
2621                       && convert_mult_to_fma (stmt,
2622                                               gimple_assign_rhs1 (stmt),
2623                                               gimple_assign_rhs2 (stmt)))
2624                     {
2625                       gsi_remove (&gsi, true);
2626                       release_defs (stmt);
2627                       continue;
2628                     }
2629                   break;
2630
2631                 case PLUS_EXPR:
2632                 case MINUS_EXPR:
2633                   convert_plusminus_to_widen (&gsi, stmt, code);
2634                   break;
2635
2636                 default:;
2637                 }
2638             }
2639           else if (is_gimple_call (stmt)
2640                    && gimple_call_lhs (stmt))
2641             {
2642               tree fndecl = gimple_call_fndecl (stmt);
2643               if (fndecl
2644                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2645                 {
2646                   switch (DECL_FUNCTION_CODE (fndecl))
2647                     {
2648                       case BUILT_IN_POWF:
2649                       case BUILT_IN_POW:
2650                       case BUILT_IN_POWL:
2651                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2652                             && REAL_VALUES_EQUAL
2653                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2654                                   dconst2)
2655                             && convert_mult_to_fma (stmt,
2656                                                     gimple_call_arg (stmt, 0),
2657                                                     gimple_call_arg (stmt, 0)))
2658                           {
2659                             unlink_stmt_vdef (stmt);
2660                             if (gsi_remove (&gsi, true)
2661                                 && gimple_purge_dead_eh_edges (bb))
2662                               cfg_changed = true;
2663                             release_defs (stmt);
2664                             continue;
2665                           }
2666                           break;
2667
2668                       default:;
2669                     }
2670                 }
2671             }
2672           gsi_next (&gsi);
2673         }
2674     }
2675
2676   statistics_counter_event (cfun, "widening multiplications inserted",
2677                             widen_mul_stats.widen_mults_inserted);
2678   statistics_counter_event (cfun, "widening maccs inserted",
2679                             widen_mul_stats.maccs_inserted);
2680   statistics_counter_event (cfun, "fused multiply-adds inserted",
2681                             widen_mul_stats.fmas_inserted);
2682
2683   return cfg_changed ? TODO_cleanup_cfg : 0;
2684 }
2685
2686 static bool
2687 gate_optimize_widening_mul (void)
2688 {
2689   return flag_expensive_optimizations && optimize;
2690 }
2691
2692 struct gimple_opt_pass pass_optimize_widening_mul =
2693 {
2694  {
2695   GIMPLE_PASS,
2696   "widening_mul",                       /* name */
2697   gate_optimize_widening_mul,           /* gate */
2698   execute_optimize_widening_mul,        /* execute */
2699   NULL,                                 /* sub */
2700   NULL,                                 /* next */
2701   0,                                    /* static_pass_number */
2702   TV_NONE,                              /* tv_id */
2703   PROP_ssa,                             /* properties_required */
2704   0,                                    /* properties_provided */
2705   0,                                    /* properties_destroyed */
2706   0,                                    /* todo_flags_start */
2707   TODO_verify_ssa
2708   | TODO_verify_stmts
2709   | TODO_update_ssa                     /* todo_flags_finish */
2710  }
2711 };