gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
   46    by the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
   64    and walks it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "tree-pass.h"
  96 #include "alloc-pool.h"
  97 #include "basic-block.h"
  98 #include "target.h"
  99 #include "gimple-pretty-print.h"
 100
 101 /* FIXME: RTL headers have to be included here for optabs.  */
 102 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 103 #include "expr.h"               /* Because optabs.h wants sepops.  */
 104 #include "optabs.h"
 105
 106 /* This structure represents one basic block that either computes a
 107    division, or is a common dominator for basic block that compute a
 108    division.  */
 109 struct occurrence {
 110   /* The basic block represented by this structure.  */
 111   basic_block bb;
 112
 113   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 114      inserted in BB.  */
 115   tree recip_def;
 116
 117   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 118      was inserted in BB.  */
 119   gimple recip_def_stmt;
 120
 121   /* Pointer to a list of "struct occurrence"s for blocks dominated
 122      by BB.  */
 123   struct occurrence *children;
 124
 125   /* Pointer to the next "struct occurrence"s in the list of blocks
 126      sharing a common dominator.  */
 127   struct occurrence *next;
 128
 129   /* The number of divisions that are in BB before compute_merit.  The
 130      number of divisions that are in BB or post-dominate it after
 131      compute_merit.  */
 132   int num_divisions;
 133
 134   /* True if the basic block has a division, false if it is a common
 135      dominator for basic blocks that do.  If it is false and trapping
 136      math is active, BB is not a candidate for inserting a reciprocal.  */
 137   bool bb_has_division;
 138 };
 139
 140 static struct
 141 {
 142   /* Number of 1.0/X ops inserted.  */
 143   int rdivs_inserted;
 144
 145   /* Number of 1.0/FUNC ops inserted.  */
 146   int rfuncs_inserted;
 147 } reciprocal_stats;
 148
 149 static struct
 150 {
 151   /* Number of cexpi calls inserted.  */
 152   int inserted;
 153 } sincos_stats;
 154
 155 static struct
 156 {
 157   /* Number of hand-written 32-bit bswaps found.  */
 158   int found_32bit;
 159
 160   /* Number of hand-written 64-bit bswaps found.  */
 161   int found_64bit;
 162 } bswap_stats;
 163
 164 static struct
 165 {
 166   /* Number of widening multiplication ops inserted.  */
 167   int widen_mults_inserted;
 168
 169   /* Number of integer multiply-and-accumulate ops inserted.  */
 170   int maccs_inserted;
 171
 172   /* Number of fp fused multiply-add ops inserted.  */
 173   int fmas_inserted;
 174 } widen_mul_stats;
 175
 176 /* The instance of "struct occurrence" representing the highest
 177    interesting block in the dominator tree.  */
 178 static struct occurrence *occ_head;
 179
 180 /* Allocation pool for getting instances of "struct occurrence".  */
 181 static alloc_pool occ_pool;
 182
 183
 184
 185 /* Allocate and return a new struct occurrence for basic block BB, and
 186    whose children list is headed by CHILDREN.  */
 187 static struct occurrence *
 188 occ_new (basic_block bb, struct occurrence *children)
 189 {
 190   struct occurrence *occ;
 191
 192   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 193   memset (occ, 0, sizeof (struct occurrence));
 194
 195   occ->bb = bb;
 196   occ->children = children;
 197   return occ;
 198 }
 199
 200
 201 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 202    list of "struct occurrence"s, one per basic block, having IDOM as
 203    their common dominator.
 204
 205    We try to insert NEW_OCC as deep as possible in the tree, and we also
 206    insert any other block that is a common dominator for BB and one
 207    block already in the tree.  */
 208
 209 static void
 210 insert_bb (struct occurrence *new_occ, basic_block idom,
 211            struct occurrence **p_head)
 212 {
 213   struct occurrence *occ, **p_occ;
 214
 215   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 216     {
 217       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 218       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 219       if (dom == bb)
 220         {
 221           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 222              from its list.  */
 223           *p_occ = occ->next;
 224           occ->next = new_occ->children;
 225           new_occ->children = occ;
 226
 227           /* Try the next block (it may as well be dominated by BB).  */
 228         }
 229
 230       else if (dom == occ_bb)
 231         {
 232           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 233           insert_bb (new_occ, dom, &occ->children);
 234           return;
 235         }
 236
 237       else if (dom != idom)
 238         {
 239           gcc_assert (!dom->aux);
 240
 241           /* There is a dominator between IDOM and BB, add it and make
 242              two children out of NEW_OCC and OCC.  First, remove OCC from
 243              its list.  */
 244           *p_occ = occ->next;
 245           new_occ->next = occ;
 246           occ->next = NULL;
 247
 248           /* None of the previous blocks has DOM as a dominator: if we tail
 249              recursed, we would reexamine them uselessly. Just switch BB with
 250              DOM, and go on looking for blocks dominated by DOM.  */
 251           new_occ = occ_new (dom, new_occ);
 252         }
 253
 254       else
 255         {
 256           /* Nothing special, go on with the next element.  */
 257           p_occ = &occ->next;
 258         }
 259     }
 260
 261   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 262   new_occ->next = *p_head;
 263   *p_head = new_occ;
 264 }
 265
 266 /* Register that we found a division in BB.  */
 267
 268 static inline void
 269 register_division_in (basic_block bb)
 270 {
 271   struct occurrence *occ;
 272
 273   occ = (struct occurrence *) bb->aux;
 274   if (!occ)
 275     {
 276       occ = occ_new (bb, NULL);
 277       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 278     }
 279
 280   occ->bb_has_division = true;
 281   occ->num_divisions++;
 282 }
 283
 284
 285 /* Compute the number of divisions that postdominate each block in OCC and
 286    its children.  */
 287
 288 static void
 289 compute_merit (struct occurrence *occ)
 290 {
 291   struct occurrence *occ_child;
 292   basic_block dom = occ->bb;
 293
 294   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 295     {
 296       basic_block bb;
 297       if (occ_child->children)
 298         compute_merit (occ_child);
 299
 300       if (flag_exceptions)
 301         bb = single_noncomplex_succ (dom);
 302       else
 303         bb = dom;
 304
 305       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 306         occ->num_divisions += occ_child->num_divisions;
 307     }
 308 }
 309
 310
 311 /* Return whether USE_STMT is a floating-point division by DEF.  */
 312 static inline bool
 313 is_division_by (gimple use_stmt, tree def)
 314 {
 315   return is_gimple_assign (use_stmt)
 316          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 317          && gimple_assign_rhs2 (use_stmt) == def
 318          /* Do not recognize x / x as valid division, as we are getting
 319             confused later by replacing all immediate uses x in such
 320             a stmt.  */
 321          && gimple_assign_rhs1 (use_stmt) != def;
 322 }
 323
 324 /* Walk the subset of the dominator tree rooted at OCC, setting the
 325    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 326    the given basic block.  The field may be left NULL, of course,
 327    if it is not possible or profitable to do the optimization.
 328
 329    DEF_BSI is an iterator pointing at the statement defining DEF.
 330    If RECIP_DEF is set, a dominator already has a computation that can
 331    be used.  */
 332
 333 static void
 334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 335                     tree def, tree recip_def, int threshold)
 336 {
 337   tree type;
 338   gimple new_stmt;
 339   gimple_stmt_iterator gsi;
 340   struct occurrence *occ_child;
 341
 342   if (!recip_def
 343       && (occ->bb_has_division || !flag_trapping_math)
 344       && occ->num_divisions >= threshold)
 345     {
 346       /* Make a variable with the replacement and substitute it.  */
 347       type = TREE_TYPE (def);
 348       recip_def = create_tmp_reg (type, "reciptmp");
 349       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 350                                                build_one_cst (type), def);
 351
 352       if (occ->bb_has_division)
 353         {
 354           /* Case 1: insert before an existing division.  */
 355           gsi = gsi_after_labels (occ->bb);
 356           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 357             gsi_next (&gsi);
 358
 359           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 360         }
 361       else if (def_gsi && occ->bb == def_gsi->bb)
 362         {
 363           /* Case 2: insert right after the definition.  Note that this will
 364              never happen if the definition statement can throw, because in
 365              that case the sole successor of the statement's basic block will
 366              dominate all the uses as well.  */
 367           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 368         }
 369       else
 370         {
 371           /* Case 3: insert in a basic block not containing defs/uses.  */
 372           gsi = gsi_after_labels (occ->bb);
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375
 376       reciprocal_stats.rdivs_inserted++;
 377
 378       occ->recip_def_stmt = new_stmt;
 379     }
 380
 381   occ->recip_def = recip_def;
 382   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 383     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 384 }
 385
 386
 387 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 388    possible.  */
 389
 390 static inline void
 391 replace_reciprocal (use_operand_p use_p)
 392 {
 393   gimple use_stmt = USE_STMT (use_p);
 394   basic_block bb = gimple_bb (use_stmt);
 395   struct occurrence *occ = (struct occurrence *) bb->aux;
 396
 397   if (optimize_bb_for_speed_p (bb)
 398       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 399     {
 400       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (&gsi);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 517         && is_gimple_reg (arg))
 518       {
 519         tree name = ssa_default_def (cfun, arg);
 520         if (name)
 521           execute_cse_reciprocals_1 (NULL, name);
 522       }
 523
 524   FOR_EACH_BB (bb)
 525     {
 526       gimple_stmt_iterator gsi;
 527       gimple phi;
 528       tree def;
 529
 530       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 531         {
 532           phi = gsi_stmt (gsi);
 533           def = PHI_RESULT (phi);
 534           if (! virtual_operand_p (def)
 535               && FLOAT_TYPE_P (TREE_TYPE (def)))
 536             execute_cse_reciprocals_1 (NULL, def);
 537         }
 538
 539       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 540         {
 541           gimple stmt = gsi_stmt (gsi);
 542
 543           if (gimple_has_lhs (stmt)
 544               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 545               && FLOAT_TYPE_P (TREE_TYPE (def))
 546               && TREE_CODE (def) == SSA_NAME)
 547             execute_cse_reciprocals_1 (&gsi, def);
 548         }
 549
 550       if (optimize_bb_for_size_p (bb))
 551         continue;
 552
 553       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 554       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 555         {
 556           gimple stmt = gsi_stmt (gsi);
 557           tree fndecl;
 558
 559           if (is_gimple_assign (stmt)
 560               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 561             {
 562               tree arg1 = gimple_assign_rhs2 (stmt);
 563               gimple stmt1;
 564
 565               if (TREE_CODE (arg1) != SSA_NAME)
 566                 continue;
 567
 568               stmt1 = SSA_NAME_DEF_STMT (arg1);
 569
 570               if (is_gimple_call (stmt1)
 571                   && gimple_call_lhs (stmt1)
 572                   && (fndecl = gimple_call_fndecl (stmt1))
 573                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 574                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 575                 {
 576                   enum built_in_function code;
 577                   bool md_code, fail;
 578                   imm_use_iterator ui;
 579                   use_operand_p use_p;
 580
 581                   code = DECL_FUNCTION_CODE (fndecl);
 582                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 583
 584                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 585                   if (!fndecl)
 586                     continue;
 587
 588                   /* Check that all uses of the SSA name are divisions,
 589                      otherwise replacing the defining statement will do
 590                      the wrong thing.  */
 591                   fail = false;
 592                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 593                     {
 594                       gimple stmt2 = USE_STMT (use_p);
 595                       if (is_gimple_debug (stmt2))
 596                         continue;
 597                       if (!is_gimple_assign (stmt2)
 598                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 599                           || gimple_assign_rhs1 (stmt2) == arg1
 600                           || gimple_assign_rhs2 (stmt2) != arg1)
 601                         {
 602                           fail = true;
 603                           break;
 604                         }
 605                     }
 606                   if (fail)
 607                     continue;
 608
 609                   gimple_replace_lhs (stmt1, arg1);
 610                   gimple_call_set_fndecl (stmt1, fndecl);
 611                   update_stmt (stmt1);
 612                   reciprocal_stats.rfuncs_inserted++;
 613
 614                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 615                     {
 616                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 617                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 618                       fold_stmt_inplace (&gsi);
 619                       update_stmt (stmt);
 620                     }
 621                 }
 622             }
 623         }
 624     }
 625
 626   statistics_counter_event (cfun, "reciprocal divs inserted",
 627                             reciprocal_stats.rdivs_inserted);
 628   statistics_counter_event (cfun, "reciprocal functions inserted",
 629                             reciprocal_stats.rfuncs_inserted);
 630
 631   free_dominance_info (CDI_DOMINATORS);
 632   free_dominance_info (CDI_POST_DOMINATORS);
 633   free_alloc_pool (occ_pool);
 634   return 0;
 635 }
 636
 637 struct gimple_opt_pass pass_cse_reciprocals =
 638 {
 639  {
 640   GIMPLE_PASS,
 641   "recip",                              /* name */
 642   gate_cse_reciprocals,                 /* gate */
 643   execute_cse_reciprocals,              /* execute */
 644   NULL,                                 /* sub */
 645   NULL,                                 /* next */
 646   0,                                    /* static_pass_number */
 647   TV_NONE,                              /* tv_id */
 648   PROP_ssa,                             /* properties_required */
 649   0,                                    /* properties_provided */
 650   0,                                    /* properties_destroyed */
 651   0,                                    /* todo_flags_start */
 652   TODO_update_ssa | TODO_verify_ssa
 653     | TODO_verify_stmts                /* todo_flags_finish */
 654  }
 655 };
 656
 657 /* Records an occurrence at statement USE_STMT in the vector of trees
 658    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 659    is not yet initialized.  Returns true if the occurrence was pushed on
 660    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 661    statements in the vector.  */
 662
 663 static bool
 664 maybe_record_sincos (VEC(gimple, heap) **stmts,
 665                      basic_block *top_bb, gimple use_stmt)
 666 {
 667   basic_block use_bb = gimple_bb (use_stmt);
 668   if (*top_bb
 669       && (*top_bb == use_bb
 670           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 671     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 672   else if (!*top_bb
 673            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 674     {
 675       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 676       *top_bb = use_bb;
 677     }
 678   else
 679     return false;
 680
 681   return true;
 682 }
 683
 684 /* Look for sin, cos and cexpi calls with the same argument NAME and
 685    create a single call to cexpi CSEing the result in this case.
 686    We first walk over all immediate uses of the argument collecting
 687    statements that we can CSE in a vector and in a second pass replace
 688    the statement rhs with a REALPART or IMAGPART expression on the
 689    result of the cexpi call we insert before the use statement that
 690    dominates all other candidates.  */
 691
 692 static bool
 693 execute_cse_sincos_1 (tree name)
 694 {
 695   gimple_stmt_iterator gsi;
 696   imm_use_iterator use_iter;
 697   tree fndecl, res, type;
 698   gimple def_stmt, use_stmt, stmt;
 699   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 700   VEC(gimple, heap) *stmts = NULL;
 701   basic_block top_bb = NULL;
 702   int i;
 703   bool cfg_changed = false;
 704
 705   type = TREE_TYPE (name);
 706   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 707     {
 708       if (gimple_code (use_stmt) != GIMPLE_CALL
 709           || !gimple_call_lhs (use_stmt)
 710           || !(fndecl = gimple_call_fndecl (use_stmt))
 711           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 712         continue;
 713
 714       switch (DECL_FUNCTION_CODE (fndecl))
 715         {
 716         CASE_FLT_FN (BUILT_IN_COS):
 717           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_SIN):
 721           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         CASE_FLT_FN (BUILT_IN_CEXPI):
 725           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 726           break;
 727
 728         default:;
 729         }
 730     }
 731
 732   if (seen_cos + seen_sin + seen_cexpi <= 1)
 733     {
 734       VEC_free(gimple, heap, stmts);
 735       return false;
 736     }
 737
 738   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 739      the name def statement.  */
 740   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 741   if (!fndecl)
 742     return false;
 743   stmt = gimple_build_call (fndecl, 1, name);
 744   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 745   gimple_call_set_lhs (stmt, res);
 746
 747   def_stmt = SSA_NAME_DEF_STMT (name);
 748   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 749       && gimple_code (def_stmt) != GIMPLE_PHI
 750       && gimple_bb (def_stmt) == top_bb)
 751     {
 752       gsi = gsi_for_stmt (def_stmt);
 753       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 754     }
 755   else
 756     {
 757       gsi = gsi_after_labels (top_bb);
 758       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 759     }
 760   sincos_stats.inserted++;
 761
 762   /* And adjust the recorded old call sites.  */
 763   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 764     {
 765       tree rhs = NULL;
 766       fndecl = gimple_call_fndecl (use_stmt);
 767
 768       switch (DECL_FUNCTION_CODE (fndecl))
 769         {
 770         CASE_FLT_FN (BUILT_IN_COS):
 771           rhs = fold_build1 (REALPART_EXPR, type, res);
 772           break;
 773
 774         CASE_FLT_FN (BUILT_IN_SIN):
 775           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 776           break;
 777
 778         CASE_FLT_FN (BUILT_IN_CEXPI):
 779           rhs = res;
 780           break;
 781
 782         default:;
 783           gcc_unreachable ();
 784         }
 785
 786         /* Replace call with a copy.  */
 787         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 788
 789         gsi = gsi_for_stmt (use_stmt);
 790         gsi_replace (&gsi, stmt, true);
 791         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 792           cfg_changed = true;
 793     }
 794
 795   VEC_free(gimple, heap, stmts);
 796
 797   return cfg_changed;
 798 }
 799
 800 /* To evaluate powi(x,n), the floating point value x raised to the
 801    constant integer exponent n, we use a hybrid algorithm that
 802    combines the "window method" with look-up tables.  For an
 803    introduction to exponentiation algorithms and "addition chains",
 804    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 805    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 806    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 807    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 808
 809 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 810    multiplications to inline before calling the system library's pow
 811    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 812    so this default never requires calling pow, powf or powl.  */
 813
 814 #ifndef POWI_MAX_MULTS
 815 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 816 #endif
 817
 818 /* The size of the "optimal power tree" lookup table.  All
 819    exponents less than this value are simply looked up in the
 820    powi_table below.  This threshold is also used to size the
 821    cache of pseudo registers that hold intermediate results.  */
 822 #define POWI_TABLE_SIZE 256
 823
 824 /* The size, in bits of the window, used in the "window method"
 825    exponentiation algorithm.  This is equivalent to a radix of
 826    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 827 #define POWI_WINDOW_SIZE 3
 828
 829 /* The following table is an efficient representation of an
 830    "optimal power tree".  For each value, i, the corresponding
 831    value, j, in the table states that an optimal evaluation
 832    sequence for calculating pow(x,i) can be found by evaluating
 833    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 834    100 integers is given in Knuth's "Seminumerical algorithms".  */
 835
 836 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 837   {
 838       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 839       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 840       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 841      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 842      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 843      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 844      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 845      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 846      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 847      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 848      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 849      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 850      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 851      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 852      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 853      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 854      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 855      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 856      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 857      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 858      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 859      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 860      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 861      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 862      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 863     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 864     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 865     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 866     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 867     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 868     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 869     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 870   };
 871
 872
 873 /* Return the number of multiplications required to calculate
 874    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 875    subroutine of powi_cost.  CACHE is an array indicating
 876    which exponents have already been calculated.  */
 877
 878 static int
 879 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 880 {
 881   /* If we've already calculated this exponent, then this evaluation
 882      doesn't require any additional multiplications.  */
 883   if (cache[n])
 884     return 0;
 885
 886   cache[n] = true;
 887   return powi_lookup_cost (n - powi_table[n], cache)
 888          + powi_lookup_cost (powi_table[n], cache) + 1;
 889 }
 890
 891 /* Return the number of multiplications required to calculate
 892    powi(x,n) for an arbitrary x, given the exponent N.  This
 893    function needs to be kept in sync with powi_as_mults below.  */
 894
 895 static int
 896 powi_cost (HOST_WIDE_INT n)
 897 {
 898   bool cache[POWI_TABLE_SIZE];
 899   unsigned HOST_WIDE_INT digit;
 900   unsigned HOST_WIDE_INT val;
 901   int result;
 902
 903   if (n == 0)
 904     return 0;
 905
 906   /* Ignore the reciprocal when calculating the cost.  */
 907   val = (n < 0) ? -n : n;
 908
 909   /* Initialize the exponent cache.  */
 910   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 911   cache[1] = true;
 912
 913   result = 0;
 914
 915   while (val >= POWI_TABLE_SIZE)
 916     {
 917       if (val & 1)
 918         {
 919           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 920           result += powi_lookup_cost (digit, cache)
 921                     + POWI_WINDOW_SIZE + 1;
 922           val >>= POWI_WINDOW_SIZE;
 923         }
 924       else
 925         {
 926           val >>= 1;
 927           result++;
 928         }
 929     }
 930
 931   return result + powi_lookup_cost (val, cache);
 932 }
 933
 934 /* Recursive subroutine of powi_as_mults.  This function takes the
 935    array, CACHE, of already calculated exponents and an exponent N and
 936    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 937
 938 static tree
 939 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 940                  HOST_WIDE_INT n, tree *cache)
 941 {
 942   tree op0, op1, ssa_target;
 943   unsigned HOST_WIDE_INT digit;
 944   gimple mult_stmt;
 945
 946   if (n < POWI_TABLE_SIZE && cache[n])
 947     return cache[n];
 948
 949   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 950
 951   if (n < POWI_TABLE_SIZE)
 952     {
 953       cache[n] = ssa_target;
 954       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 955       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 956     }
 957   else if (n & 1)
 958     {
 959       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 960       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 961       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 962     }
 963   else
 964     {
 965       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 966       op1 = op0;
 967     }
 968
 969   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 970   gimple_set_location (mult_stmt, loc);
 971   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 972
 973   return ssa_target;
 974 }
 975
 976 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 977    This function needs to be kept in sync with powi_cost above.  */
 978
 979 static tree
 980 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 981                tree arg0, HOST_WIDE_INT n)
 982 {
 983   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
 984   gimple div_stmt;
 985   tree target;
 986
 987   if (n == 0)
 988     return build_real (type, dconst1);
 989
 990   memset (cache, 0,  sizeof (cache));
 991   cache[1] = arg0;
 992
 993   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
 994   if (n >= 0)
 995     return result;
 996
 997   /* If the original exponent was negative, reciprocate the result.  */
 998   target = make_temp_ssa_name (type, NULL, "powmult");
 999   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1000                                            build_real (type, dconst1),
1001                                            result);
1002   gimple_set_location (div_stmt, loc);
1003   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1004
1005   return target;
1006 }
1007
1008 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1009    location info LOC.  If the arguments are appropriate, create an
1010    equivalent sequence of statements prior to GSI using an optimal
1011    number of multiplications, and return an expession holding the
1012    result.  */
1013
1014 static tree
1015 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1016                             tree arg0, HOST_WIDE_INT n)
1017 {
1018   /* Avoid largest negative number.  */
1019   if (n != -n
1020       && ((n >= -1 && n <= 2)
1021           || (optimize_function_for_speed_p (cfun)
1022               && powi_cost (n) <= POWI_MAX_MULTS)))
1023     return powi_as_mults (gsi, loc, arg0, n);
1024
1025   return NULL_TREE;
1026 }
1027
1028 /* Build a gimple call statement that calls FN with argument ARG.
1029    Set the lhs of the call statement to a fresh SSA name.  Insert the
1030    statement prior to GSI's current position, and return the fresh
1031    SSA name.  */
1032
1033 static tree
1034 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1035                        tree fn, tree arg)
1036 {
1037   gimple call_stmt;
1038   tree ssa_target;
1039
1040   call_stmt = gimple_build_call (fn, 1, arg);
1041   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1042   gimple_set_lhs (call_stmt, ssa_target);
1043   gimple_set_location (call_stmt, loc);
1044   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1045
1046   return ssa_target;
1047 }
1048
1049 /* Build a gimple binary operation with the given CODE and arguments
1050    ARG0, ARG1, assigning the result to a new SSA name for variable
1051    TARGET.  Insert the statement prior to GSI's current position, and
1052    return the fresh SSA name.*/
1053
1054 static tree
1055 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1056                         const char *name, enum tree_code code,
1057                         tree arg0, tree arg1)
1058 {
1059   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1060   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1061   gimple_set_location (stmt, loc);
1062   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1063   return result;
1064 }
1065
1066 /* Build a gimple reference operation with the given CODE and argument
1067    ARG, assigning the result to a new SSA name of TYPE with NAME.
1068    Insert the statement prior to GSI's current position, and return
1069    the fresh SSA name.  */
1070
1071 static inline tree
1072 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1073                       const char *name, enum tree_code code, tree arg0)
1074 {
1075   tree result = make_temp_ssa_name (type, NULL, name);
1076   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1077   gimple_set_location (stmt, loc);
1078   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1079   return result;
1080 }
1081
1082 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1083    prior to GSI's current position, and return the fresh SSA name.  */
1084
1085 static tree
1086 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1087                        tree type, tree val)
1088 {
1089   tree result = make_ssa_name (type, NULL);
1090   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1091   gimple_set_location (stmt, loc);
1092   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1093   return result;
1094 }
1095
1096 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1097    with location info LOC.  If possible, create an equivalent and
1098    less expensive sequence of statements prior to GSI, and return an
1099    expession holding the result.  */
1100
1101 static tree
1102 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1103                            tree arg0, tree arg1)
1104 {
1105   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1106   REAL_VALUE_TYPE c2, dconst3;
1107   HOST_WIDE_INT n;
1108   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1109   enum machine_mode mode;
1110   bool hw_sqrt_exists;
1111
1112   /* If the exponent isn't a constant, there's nothing of interest
1113      to be done.  */
1114   if (TREE_CODE (arg1) != REAL_CST)
1115     return NULL_TREE;
1116
1117   /* If the exponent is equivalent to an integer, expand to an optimal
1118      multiplication sequence when profitable.  */
1119   c = TREE_REAL_CST (arg1);
1120   n = real_to_integer (&c);
1121   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1122
1123   if (real_identical (&c, &cint)
1124       && ((n >= -1 && n <= 2)
1125           || (flag_unsafe_math_optimizations
1126               && optimize_insn_for_speed_p ()
1127               && powi_cost (n) <= POWI_MAX_MULTS)))
1128     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1129
1130   /* Attempt various optimizations using sqrt and cbrt.  */
1131   type = TREE_TYPE (arg0);
1132   mode = TYPE_MODE (type);
1133   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1134
1135   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1136      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1137      sqrt(-0) = -0.  */
1138   if (sqrtfn
1139       && REAL_VALUES_EQUAL (c, dconsthalf)
1140       && !HONOR_SIGNED_ZEROS (mode))
1141     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1142
1143   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1144      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1145      so do this optimization even if -Os.  Don't do this optimization
1146      if we don't have a hardware sqrt insn.  */
1147   dconst1_4 = dconst1;
1148   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1149   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1150
1151   if (flag_unsafe_math_optimizations
1152       && sqrtfn
1153       && REAL_VALUES_EQUAL (c, dconst1_4)
1154       && hw_sqrt_exists)
1155     {
1156       /* sqrt(x)  */
1157       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1158
1159       /* sqrt(sqrt(x))  */
1160       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1161     }
1162
1163   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1164      optimizing for space.  Don't do this optimization if we don't have
1165      a hardware sqrt insn.  */
1166   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1167   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1168
1169   if (flag_unsafe_math_optimizations
1170       && sqrtfn
1171       && optimize_function_for_speed_p (cfun)
1172       && REAL_VALUES_EQUAL (c, dconst3_4)
1173       && hw_sqrt_exists)
1174     {
1175       /* sqrt(x)  */
1176       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1177
1178       /* sqrt(sqrt(x))  */
1179       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1180
1181       /* sqrt(x) * sqrt(sqrt(x))  */
1182       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1183                                      sqrt_arg0, sqrt_sqrt);
1184     }
1185
1186   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1187      optimizations since 1./3. is not exactly representable.  If x
1188      is negative and finite, the correct value of pow(x,1./3.) is
1189      a NaN with the "invalid" exception raised, because the value
1190      of 1./3. actually has an even denominator.  The correct value
1191      of cbrt(x) is a negative real value.  */
1192   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1193   dconst1_3 = real_value_truncate (mode, dconst_third ());
1194
1195   if (flag_unsafe_math_optimizations
1196       && cbrtfn
1197       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1198       && REAL_VALUES_EQUAL (c, dconst1_3))
1199     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1200
1201   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_6 = dconst1_3;
1204   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1205
1206   if (flag_unsafe_math_optimizations
1207       && sqrtfn
1208       && cbrtfn
1209       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1210       && optimize_function_for_speed_p (cfun)
1211       && hw_sqrt_exists
1212       && REAL_VALUES_EQUAL (c, dconst1_6))
1213     {
1214       /* sqrt(x)  */
1215       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1216
1217       /* cbrt(sqrt(x))  */
1218       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1219     }
1220
1221   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1222
1223        sqrt(x) * powi(x, n/2),                n > 0;
1224        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1225
1226      Do not calculate the powi factor when n/2 = 0.  */
1227   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1228   n = real_to_integer (&c2);
1229   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1230
1231   if (flag_unsafe_math_optimizations
1232       && sqrtfn
1233       && real_identical (&c2, &cint))
1234     {
1235       tree powi_x_ndiv2 = NULL_TREE;
1236
1237       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1238          possible or profitable, give up.  Skip the degenerate case when
1239          n is 1 or -1, where the result is always 1.  */
1240       if (absu_hwi (n) != 1)
1241         {
1242           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1243                                                      abs_hwi (n / 2));
1244           if (!powi_x_ndiv2)
1245             return NULL_TREE;
1246         }
1247
1248       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1249          result of the optimal multiply sequence just calculated.  */
1250       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1251
1252       if (absu_hwi (n) == 1)
1253         result = sqrt_arg0;
1254       else
1255         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1256                                          sqrt_arg0, powi_x_ndiv2);
1257
1258       /* If n is negative, reciprocate the result.  */
1259       if (n < 0)
1260         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1261                                          build_real (type, dconst1), result);
1262       return result;
1263     }
1264
1265   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1266
1267      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1268      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1269
1270      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1271      different from pow(x, 1./3.) due to rounding and behavior with
1272      negative x, we need to constrain this transformation to unsafe
1273      math and positive x or finite math.  */
1274   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1275   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1276   real_round (&c2, mode, &c2);
1277   n = real_to_integer (&c2);
1278   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1279   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1280   real_convert (&c2, mode, &c2);
1281
1282   if (flag_unsafe_math_optimizations
1283       && cbrtfn
1284       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1285       && real_identical (&c2, &c)
1286       && optimize_function_for_speed_p (cfun)
1287       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1288     {
1289       tree powi_x_ndiv3 = NULL_TREE;
1290
1291       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1292          possible or profitable, give up.  Skip the degenerate case when
1293          abs(n) < 3, where the result is always 1.  */
1294       if (absu_hwi (n) >= 3)
1295         {
1296           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1297                                                      abs_hwi (n / 3));
1298           if (!powi_x_ndiv3)
1299             return NULL_TREE;
1300         }
1301
1302       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1303          as that creates an unnecessary variable.  Instead, just produce
1304          either cbrt(x) or cbrt(x) * cbrt(x).  */
1305       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1306
1307       if (absu_hwi (n) % 3 == 1)
1308         powi_cbrt_x = cbrt_x;
1309       else
1310         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1311                                               cbrt_x, cbrt_x);
1312
1313       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1314       if (absu_hwi (n) < 3)
1315         result = powi_cbrt_x;
1316       else
1317         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1318                                          powi_x_ndiv3, powi_cbrt_x);
1319
1320       /* If n is negative, reciprocate the result.  */
1321       if (n < 0)
1322         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1323                                          build_real (type, dconst1), result);
1324
1325       return result;
1326     }
1327
1328   /* No optimizations succeeded.  */
1329   return NULL_TREE;
1330 }
1331
1332 /* ARG is the argument to a cabs builtin call in GSI with location info
1333    LOC.  Create a sequence of statements prior to GSI that calculates
1334    sqrt(R*R + I*I), where R and I are the real and imaginary components
1335    of ARG, respectively.  Return an expression holding the result.  */
1336
1337 static tree
1338 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1339 {
1340   tree real_part, imag_part, addend1, addend2, sum, result;
1341   tree type = TREE_TYPE (TREE_TYPE (arg));
1342   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1343   enum machine_mode mode = TYPE_MODE (type);
1344
1345   if (!flag_unsafe_math_optimizations
1346       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1347       || !sqrtfn
1348       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1349     return NULL_TREE;
1350
1351   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1352                                     REALPART_EXPR, arg);
1353   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1354                                     real_part, real_part);
1355   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1356                                     IMAGPART_EXPR, arg);
1357   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1358                                     imag_part, imag_part);
1359   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1360   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1361
1362   return result;
1363 }
1364
1365 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1366    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1367    an optimal number of multiplies, when n is a constant.  */
1368
1369 static unsigned int
1370 execute_cse_sincos (void)
1371 {
1372   basic_block bb;
1373   bool cfg_changed = false;
1374
1375   calculate_dominance_info (CDI_DOMINATORS);
1376   memset (&sincos_stats, 0, sizeof (sincos_stats));
1377
1378   FOR_EACH_BB (bb)
1379     {
1380       gimple_stmt_iterator gsi;
1381       bool cleanup_eh = false;
1382
1383       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1384         {
1385           gimple stmt = gsi_stmt (gsi);
1386           tree fndecl;
1387
1388           /* Only the last stmt in a bb could throw, no need to call
1389              gimple_purge_dead_eh_edges if we change something in the middle
1390              of a basic block.  */
1391           cleanup_eh = false;
1392
1393           if (is_gimple_call (stmt)
1394               && gimple_call_lhs (stmt)
1395               && (fndecl = gimple_call_fndecl (stmt))
1396               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1397             {
1398               tree arg, arg0, arg1, result;
1399               HOST_WIDE_INT n;
1400               location_t loc;
1401
1402               switch (DECL_FUNCTION_CODE (fndecl))
1403                 {
1404                 CASE_FLT_FN (BUILT_IN_COS):
1405                 CASE_FLT_FN (BUILT_IN_SIN):
1406                 CASE_FLT_FN (BUILT_IN_CEXPI):
1407                   /* Make sure we have either sincos or cexp.  */
1408                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1409                     break;
1410
1411                   arg = gimple_call_arg (stmt, 0);
1412                   if (TREE_CODE (arg) == SSA_NAME)
1413                     cfg_changed |= execute_cse_sincos_1 (arg);
1414                   break;
1415
1416                 CASE_FLT_FN (BUILT_IN_POW):
1417                   arg0 = gimple_call_arg (stmt, 0);
1418                   arg1 = gimple_call_arg (stmt, 1);
1419
1420                   loc = gimple_location (stmt);
1421                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1422
1423                   if (result)
1424                     {
1425                       tree lhs = gimple_get_lhs (stmt);
1426                       gimple new_stmt = gimple_build_assign (lhs, result);
1427                       gimple_set_location (new_stmt, loc);
1428                       unlink_stmt_vdef (stmt);
1429                       gsi_replace (&gsi, new_stmt, true);
1430                       cleanup_eh = true;
1431                       if (gimple_vdef (stmt))
1432                         release_ssa_name (gimple_vdef (stmt));
1433                     }
1434                   break;
1435
1436                 CASE_FLT_FN (BUILT_IN_POWI):
1437                   arg0 = gimple_call_arg (stmt, 0);
1438                   arg1 = gimple_call_arg (stmt, 1);
1439                   if (!host_integerp (arg1, 0))
1440                     break;
1441
1442                   n = TREE_INT_CST_LOW (arg1);
1443                   loc = gimple_location (stmt);
1444                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1445
1446                   if (result)
1447                     {
1448                       tree lhs = gimple_get_lhs (stmt);
1449                       gimple new_stmt = gimple_build_assign (lhs, result);
1450                       gimple_set_location (new_stmt, loc);
1451                       unlink_stmt_vdef (stmt);
1452                       gsi_replace (&gsi, new_stmt, true);
1453                       cleanup_eh = true;
1454                       if (gimple_vdef (stmt))
1455                         release_ssa_name (gimple_vdef (stmt));
1456                     }
1457                   break;
1458
1459                 CASE_FLT_FN (BUILT_IN_CABS):
1460                   arg0 = gimple_call_arg (stmt, 0);
1461                   loc = gimple_location (stmt);
1462                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1463
1464                   if (result)
1465                     {
1466                       tree lhs = gimple_get_lhs (stmt);
1467                       gimple new_stmt = gimple_build_assign (lhs, result);
1468                       gimple_set_location (new_stmt, loc);
1469                       unlink_stmt_vdef (stmt);
1470                       gsi_replace (&gsi, new_stmt, true);
1471                       cleanup_eh = true;
1472                       if (gimple_vdef (stmt))
1473                         release_ssa_name (gimple_vdef (stmt));
1474                     }
1475                   break;
1476
1477                 default:;
1478                 }
1479             }
1480         }
1481       if (cleanup_eh)
1482         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1483     }
1484
1485   statistics_counter_event (cfun, "sincos statements inserted",
1486                             sincos_stats.inserted);
1487
1488   free_dominance_info (CDI_DOMINATORS);
1489   return cfg_changed ? TODO_cleanup_cfg : 0;
1490 }
1491
1492 static bool
1493 gate_cse_sincos (void)
1494 {
1495   /* We no longer require either sincos or cexp, since powi expansion
1496      piggybacks on this pass.  */
1497   return optimize;
1498 }
1499
1500 struct gimple_opt_pass pass_cse_sincos =
1501 {
1502  {
1503   GIMPLE_PASS,
1504   "sincos",                             /* name */
1505   gate_cse_sincos,                      /* gate */
1506   execute_cse_sincos,                   /* execute */
1507   NULL,                                 /* sub */
1508   NULL,                                 /* next */
1509   0,                                    /* static_pass_number */
1510   TV_NONE,                              /* tv_id */
1511   PROP_ssa,                             /* properties_required */
1512   0,                                    /* properties_provided */
1513   0,                                    /* properties_destroyed */
1514   0,                                    /* todo_flags_start */
1515   TODO_update_ssa | TODO_verify_ssa
1516     | TODO_verify_stmts                 /* todo_flags_finish */
1517  }
1518 };
1519
1520 /* A symbolic number is used to detect byte permutation and selection
1521    patterns.  The field N holds an artificial number made of one
1522    marker per byte; SIZE is the number of bytes being tracked:
1523
1524    0       - byte has the value 0
1525    1..size - byte contains the content of the source byte whose
1526              index is that marker value minus one  */
1527
1528 struct symbolic_number {
1529   unsigned HOST_WIDEST_INT n;   /* Byte markers, one per byte.  */
1530   int size;                     /* Number of bytes tracked.  */
1531 };
1532
1533 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1534    number N.  Return false if the requested operation is not permitted
1535    on a symbolic number.  */
1536
1537 static inline bool
1538 do_shift_rotate (enum tree_code code,
1539                  struct symbolic_number *n,
1540                  int count)
1541 {
1542   if (count % 8 != 0)
1543     return false;
1544
1545   /* Zero out the extra bits of N in order to avoid them being shifted
1546      into the significant bits.  */
1547   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1548     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1549
1550   switch (code)
1551     {
1552     case LSHIFT_EXPR:
1553       n->n <<= count;
1554       break;
1555     case RSHIFT_EXPR:
1556       n->n >>= count;
1557       break;
1558     case LROTATE_EXPR:
1559       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1560       break;
1561     case RROTATE_EXPR:
1562       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1563       break;
1564     default:
1565       return false;
1566     }
1567   /* Zero unused bits for size.  */
1568   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1569     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1570   return true;
1571 }
1572
1573 /* Perform sanity checking for the symbolic number N and the gimple
1574    statement STMT.  */
1575
1576 static inline bool
1577 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1578 {
1579   tree lhs_type;
1580
1581   lhs_type = gimple_expr_type (stmt);
1582
1583   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1584     return false;
1585
1586   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1587     return false;
1588
1589   return true;
1590 }
1591
1592 /* find_bswap_1 invokes itself recursively with N and tries to perform
1593    the operation given by the rhs of STMT on the result.  If the
1594    operation could successfully be executed the function returns the
1595    tree expression of the source operand and NULL otherwise.  */
1596
1597 static tree
1598 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1599 {
1600   enum tree_code code;
1601   tree rhs1, rhs2 = NULL;
1602   gimple rhs1_stmt, rhs2_stmt;
1603   tree source_expr1;
1604   enum gimple_rhs_class rhs_class;
1605
1606   if (!limit || !is_gimple_assign (stmt))
1607     return NULL_TREE;
1608
1609   rhs1 = gimple_assign_rhs1 (stmt);
1610
1611   if (TREE_CODE (rhs1) != SSA_NAME)
1612     return NULL_TREE;
1613
1614   code = gimple_assign_rhs_code (stmt);
1615   rhs_class = gimple_assign_rhs_class (stmt);
1616   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1617
1618   if (rhs_class == GIMPLE_BINARY_RHS)
1619     rhs2 = gimple_assign_rhs2 (stmt);
1620
1621   /* Handle unary rhs and binary rhs with integer constants as second
1622      operand.  */
1623
1624   if (rhs_class == GIMPLE_UNARY_RHS
1625       || (rhs_class == GIMPLE_BINARY_RHS
1626           && TREE_CODE (rhs2) == INTEGER_CST))
1627     {
1628       if (code != BIT_AND_EXPR
1629           && code != LSHIFT_EXPR
1630           && code != RSHIFT_EXPR
1631           && code != LROTATE_EXPR
1632           && code != RROTATE_EXPR
1633           && code != NOP_EXPR
1634           && code != CONVERT_EXPR)
1635         return NULL_TREE;
1636
1637       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1638
1639       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1640          to initialize the symbolic number.  */
1641       if (!source_expr1)
1642         {
1643           /* Set up the symbolic number N by setting each byte to a
1644              value between 1 and the byte size of rhs1.  The highest
1645              order byte is set to n->size and the lowest order
1646              byte to 1.  */
1647           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1648           if (n->size % BITS_PER_UNIT != 0)
1649             return NULL_TREE;
1650           n->size /= BITS_PER_UNIT;
1651           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1652                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1653
1654           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1655             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1656                      (n->size * BITS_PER_UNIT)) - 1;
1657
1658           source_expr1 = rhs1;
1659         }
1660
1661       switch (code)
1662         {
1663         case BIT_AND_EXPR:
1664           {
1665             int i;
1666             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1667             unsigned HOST_WIDEST_INT tmp = val;
1668
1669             /* Only constants masking full bytes are allowed.  */
1670             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1671               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1672                 return NULL_TREE;
1673
1674             n->n &= val;
1675           }
1676           break;
1677         case LSHIFT_EXPR:
1678         case RSHIFT_EXPR:
1679         case LROTATE_EXPR:
1680         case RROTATE_EXPR:
1681           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1682             return NULL_TREE;
1683           break;
1684         CASE_CONVERT:
1685           {
1686             int type_size;
1687
1688             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1689             if (type_size % BITS_PER_UNIT != 0)
1690               return NULL_TREE;
1691
1692             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1693               {
1694                 /* If STMT casts to a smaller type mask out the bits not
1695                    belonging to the target type.  */
1696                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1697               }
1698             n->size = type_size / BITS_PER_UNIT;
1699           }
1700           break;
1701         default:
1702           return NULL_TREE;
1703         };
1704       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1705     }
1706
1707   /* Handle binary rhs.  */
1708
1709   if (rhs_class == GIMPLE_BINARY_RHS)
1710     {
1711       struct symbolic_number n1, n2;
1712       tree source_expr2;
1713
1714       if (code != BIT_IOR_EXPR)
1715         return NULL_TREE;
1716
1717       if (TREE_CODE (rhs2) != SSA_NAME)
1718         return NULL_TREE;
1719
1720       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1721
1722       switch (code)
1723         {
1724         case BIT_IOR_EXPR:
1725           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1726
1727           if (!source_expr1)
1728             return NULL_TREE;
1729
1730           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1731
1732           if (source_expr1 != source_expr2
1733               || n1.size != n2.size)
1734             return NULL_TREE;
1735
1736           n->size = n1.size;
1737           n->n = n1.n | n2.n;
1738
1739           if (!verify_symbolic_number_p (n, stmt))
1740             return NULL_TREE;
1741
1742           break;
1743         default:
1744           return NULL_TREE;
1745         }
1746       return source_expr1;
1747     }
1748   return NULL_TREE;
1749 }
1750
1751 /* Check if STMT completes a bswap implementation consisting of ORs,
1752    SHIFTs and ANDs.  Return the source tree expression on which the
1753    byte swap is performed and NULL if no bswap was found.  */
1754
1755 static tree
1756 find_bswap (gimple stmt)
1757 {
1758 /* The number which the find_bswap result should match in order to
1759    have a full byte swap.  The number is shifted to the left according
1760    to the size of the symbolic number before using it.  */
1761   unsigned HOST_WIDEST_INT cmp =
1762     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1763     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1764
1765   struct symbolic_number n;
1766   tree source_expr;
1767   int limit;
1768
1769   /* The last parameter determines the depth search limit.  It usually
1770      correlates directly to the number of bytes to be touched.  We
1771      increase that number by three  here in order to also
1772      cover signed -> unsigned converions of the src operand as can be seen
1773      in libgcc, and for initial shift/and operation of the src operand.  */
1774   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1775   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1776   source_expr =  find_bswap_1 (stmt, &n, limit);
1777
1778   if (!source_expr)
1779     return NULL_TREE;
1780
1781   /* Zero out the extra bits of N and CMP.  */
1782   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1783     {
1784       unsigned HOST_WIDEST_INT mask =
1785         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1786
1787       n.n &= mask;
1788       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1789     }
1790
1791   /* A complete byte swap should make the symbolic number to start
1792      with the largest digit in the highest order byte.  */
1793   if (cmp != n.n)
1794     return NULL_TREE;
1795
1796   return source_expr;
1797 }
1798
1799 /* Find manual byte swap implementations and turn them into a bswap
1800    builtin invokation.  */
1801
1802 static unsigned int
1803 execute_optimize_bswap (void)
1804 {
1805   basic_block bb;
1806   bool bswap32_p, bswap64_p;
1807   bool changed = false;
1808   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1809
1810   if (BITS_PER_UNIT != 8)
1811     return 0;
1812
1813   if (sizeof (HOST_WIDEST_INT) < 8)
1814     return 0;
1815
1816   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1817                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1818   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1819                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1820                    || (bswap32_p && word_mode == SImode)));
1821
1822   if (!bswap32_p && !bswap64_p)
1823     return 0;
1824
1825   /* Determine the argument type of the builtins.  The code later on
1826      assumes that the return and argument type are the same.  */
1827   if (bswap32_p)
1828     {
1829       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1830       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1831     }
1832
1833   if (bswap64_p)
1834     {
1835       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1836       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1837     }
1838
1839   memset (&bswap_stats, 0, sizeof (bswap_stats));
1840
1841   FOR_EACH_BB (bb)
1842     {
1843       gimple_stmt_iterator gsi;
1844
1845       /* We do a reverse scan for bswap patterns to make sure we get the
1846          widest match. As bswap pattern matching doesn't handle
1847          previously inserted smaller bswap replacements as sub-
1848          patterns, the wider variant wouldn't be detected.  */
1849       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1850         {
1851           gimple stmt = gsi_stmt (gsi);
1852           tree bswap_src, bswap_type;
1853           tree bswap_tmp;
1854           tree fndecl = NULL_TREE;
1855           int type_size;
1856           gimple call;
1857
1858           if (!is_gimple_assign (stmt)
1859               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1860             continue;
1861
1862           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1863
1864           switch (type_size)
1865             {
1866             case 32:
1867               if (bswap32_p)
1868                 {
1869                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1870                   bswap_type = bswap32_type;
1871                 }
1872               break;
1873             case 64:
1874               if (bswap64_p)
1875                 {
1876                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1877                   bswap_type = bswap64_type;
1878                 }
1879               break;
1880             default:
1881               continue;
1882             }
1883
1884           if (!fndecl)
1885             continue;
1886
1887           bswap_src = find_bswap (stmt);
1888
1889           if (!bswap_src)
1890             continue;
1891
1892           changed = true;
1893           if (type_size == 32)
1894             bswap_stats.found_32bit++;
1895           else
1896             bswap_stats.found_64bit++;
1897
1898           bswap_tmp = bswap_src;
1899
1900           /* Convert the src expression if necessary.  */
1901           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1902             {
1903               gimple convert_stmt;
1904               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1905               convert_stmt = gimple_build_assign_with_ops
1906                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
1907               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1908             }
1909
1910           call = gimple_build_call (fndecl, 1, bswap_tmp);
1911
1912           bswap_tmp = gimple_assign_lhs (stmt);
1913
1914           /* Convert the result if necessary.  */
1915           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1916             {
1917               gimple convert_stmt;
1918               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1919               convert_stmt = gimple_build_assign_with_ops
1920                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1921               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1922             }
1923
1924           gimple_call_set_lhs (call, bswap_tmp);
1925
1926           if (dump_file)
1927             {
1928               fprintf (dump_file, "%d bit bswap implementation found at: ",
1929                        (int)type_size);
1930               print_gimple_stmt (dump_file, stmt, 0, 0);
1931             }
1932
1933           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1934           gsi_remove (&gsi, true);
1935         }
1936     }
1937
1938   statistics_counter_event (cfun, "32-bit bswap implementations found",
1939                             bswap_stats.found_32bit);
1940   statistics_counter_event (cfun, "64-bit bswap implementations found",
1941                             bswap_stats.found_64bit);
1942
1943   return (changed ? TODO_update_ssa | TODO_verify_ssa
1944           | TODO_verify_stmts : 0);
1945 }
1946
1947 static bool
1948 gate_optimize_bswap (void)
1949 {
1950   return flag_expensive_optimizations && optimize;
1951 }
1952
1953 struct gimple_opt_pass pass_optimize_bswap =
1954 {
1955  {
1956   GIMPLE_PASS,
1957   "bswap",                              /* name */
1958   gate_optimize_bswap,                  /* gate */
1959   execute_optimize_bswap,               /* execute */
1960   NULL,                                 /* sub */
1961   NULL,                                 /* next */
1962   0,                                    /* static_pass_number */
1963   TV_NONE,                              /* tv_id */
1964   PROP_ssa,                             /* properties_required */
1965   0,                                    /* properties_provided */
1966   0,                                    /* properties_destroyed */
1967   0,                                    /* todo_flags_start */
1968   0                                     /* todo_flags_finish */
1969  }
1970 };
1971
1972 /* Return true if stmt is a type conversion operation that can be stripped
1973    when used in a widening multiply operation.  */
1974 static bool
1975 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
1976 {
1977   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
1978
1979   if (TREE_CODE (result_type) == INTEGER_TYPE)
1980     {
1981       tree op_type;
1982       tree inner_op_type;
1983
1984       if (!CONVERT_EXPR_CODE_P (rhs_code))
1985         return false;
1986
1987       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
1988
1989       /* If the type of OP has the same precision as the result, then
1990          we can strip this conversion.  The multiply operation will be
1991          selected to create the correct extension as a by-product.  */
1992       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
1993         return true;
1994
1995       /* We can also strip a conversion if it preserves the signed-ness of
1996          the operation and doesn't narrow the range.  */
1997       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
1998
1999       /* If the inner-most type is unsigned, then we can strip any
2000          intermediate widening operation.  If it's signed, then the
2001          intermediate widening operation must also be signed.  */
2002       if ((TYPE_UNSIGNED (inner_op_type)
2003            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2004           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2005         return true;
2006
2007       return false;
2008     }
2009
2010   return rhs_code == FIXED_CONVERT_EXPR;
2011 }
2012
2013 /* Return true if RHS is a suitable operand for a widening multiplication,
2014    assuming a target type of TYPE.
2015    There are two cases:
2016
2017      - RHS makes some value at least twice as wide.  Store that value
2018        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2019
2020      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2021        but leave *TYPE_OUT untouched.  */
2022
2023 static bool
2024 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2025                         tree *new_rhs_out)
2026 {
2027   gimple stmt;
2028   tree type1, rhs1;
2029
2030   if (TREE_CODE (rhs) == SSA_NAME)
2031     {
2032       stmt = SSA_NAME_DEF_STMT (rhs);
2033       if (is_gimple_assign (stmt))
2034         {
2035           if (! widening_mult_conversion_strippable_p (type, stmt))
2036             rhs1 = rhs;
2037           else
2038             {
2039               rhs1 = gimple_assign_rhs1 (stmt);
2040
2041               if (TREE_CODE (rhs1) == INTEGER_CST)
2042                 {
2043                   *new_rhs_out = rhs1;
2044                   *type_out = NULL;
2045                   return true;
2046                 }
2047             }
2048         }
2049       else
2050         rhs1 = rhs;
2051
2052       type1 = TREE_TYPE (rhs1);
2053
2054       if (TREE_CODE (type1) != TREE_CODE (type)
2055           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2056         return false;
2057
2058       *new_rhs_out = rhs1;
2059       *type_out = type1;
2060       return true;
2061     }
2062
2063   if (TREE_CODE (rhs) == INTEGER_CST)
2064     {
2065       *new_rhs_out = rhs;
2066       *type_out = NULL;
2067       return true;
2068     }
2069
2070   return false;
2071 }
2072
2073 /* Return true if STMT performs a widening multiplication, assuming the
2074    output type is TYPE.  If so, store the unwidened types of the operands
2075    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2076    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2077    and *TYPE2_OUT would give the operands of the multiplication.  */
2078
2079 static bool
2080 is_widening_mult_p (gimple stmt,
2081                     tree *type1_out, tree *rhs1_out,
2082                     tree *type2_out, tree *rhs2_out)
2083 {
2084   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2085
2086   if (TREE_CODE (type) != INTEGER_TYPE
2087       && TREE_CODE (type) != FIXED_POINT_TYPE)
2088     return false;
2089
2090   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2091                                rhs1_out))
2092     return false;
2093
2094   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2095                                rhs2_out))
2096     return false;
2097
2098   if (*type1_out == NULL)
2099     {
2100       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2101         return false;
2102       *type1_out = *type2_out;
2103     }
2104
2105   if (*type2_out == NULL)
2106     {
2107       if (!int_fits_type_p (*rhs2_out, *type1_out))
2108         return false;
2109       *type2_out = *type1_out;
2110     }
2111
2112   /* Ensure that the larger of the two operands comes first. */
2113   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2114     {
2115       tree tmp;
2116       tmp = *type1_out;
2117       *type1_out = *type2_out;
2118       *type2_out = tmp;
2119       tmp = *rhs1_out;
2120       *rhs1_out = *rhs2_out;
2121       *rhs2_out = tmp;
2122     }
2123
2124   return true;
2125 }
2126
2127 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2128    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2129    value is true iff we converted the statement.  */
2130
2131 static bool
2132 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2133 {
2134   tree lhs, rhs1, rhs2, type, type1, type2;
2135   enum insn_code handler;
2136   enum machine_mode to_mode, from_mode, actual_mode;
2137   optab op;
2138   int actual_precision;
2139   location_t loc = gimple_location (stmt);
2140   bool from_unsigned1, from_unsigned2;
2141
2142   lhs = gimple_assign_lhs (stmt);
2143   type = TREE_TYPE (lhs);
2144   if (TREE_CODE (type) != INTEGER_TYPE)
2145     return false;
2146
2147   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2148     return false;
2149
2150   to_mode = TYPE_MODE (type);
2151   from_mode = TYPE_MODE (type1);
2152   from_unsigned1 = TYPE_UNSIGNED (type1);
2153   from_unsigned2 = TYPE_UNSIGNED (type2);
2154
2155   if (from_unsigned1 && from_unsigned2)
2156     op = umul_widen_optab;
2157   else if (!from_unsigned1 && !from_unsigned2)
2158     op = smul_widen_optab;
2159   else
2160     op = usmul_widen_optab;
2161
2162   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2163                                                   0, &actual_mode);
2164
2165   if (handler == CODE_FOR_nothing)
2166     {
2167       if (op != smul_widen_optab)
2168         {
2169           /* We can use a signed multiply with unsigned types as long as
2170              there is a wider mode to use, or it is the smaller of the two
2171              types that is unsigned.  Note that type1 >= type2, always.  */
2172           if ((TYPE_UNSIGNED (type1)
2173                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2174               || (TYPE_UNSIGNED (type2)
2175                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2176             {
2177               from_mode = GET_MODE_WIDER_MODE (from_mode);
2178               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2179                 return false;
2180             }
2181
2182           op = smul_widen_optab;
2183           handler = find_widening_optab_handler_and_mode (op, to_mode,
2184                                                           from_mode, 0,
2185                                                           &actual_mode);
2186
2187           if (handler == CODE_FOR_nothing)
2188             return false;
2189
2190           from_unsigned1 = from_unsigned2 = false;
2191         }
2192       else
2193         return false;
2194     }
2195
2196   /* Ensure that the inputs to the handler are in the correct precison
2197      for the opcode.  This will be the full mode size.  */
2198   actual_precision = GET_MODE_PRECISION (actual_mode);
2199   if (2 * actual_precision > TYPE_PRECISION (type))
2200     return false;
2201   if (actual_precision != TYPE_PRECISION (type1)
2202       || from_unsigned1 != TYPE_UNSIGNED (type1))
2203     rhs1 = build_and_insert_cast (gsi, loc,
2204                                   build_nonstandard_integer_type
2205                                     (actual_precision, from_unsigned1), rhs1);
2206   if (actual_precision != TYPE_PRECISION (type2)
2207       || from_unsigned2 != TYPE_UNSIGNED (type2))
2208     rhs2 = build_and_insert_cast (gsi, loc,
2209                                   build_nonstandard_integer_type
2210                                     (actual_precision, from_unsigned2), rhs2);
2211
2212   /* Handle constants.  */
2213   if (TREE_CODE (rhs1) == INTEGER_CST)
2214     rhs1 = fold_convert (type1, rhs1);
2215   if (TREE_CODE (rhs2) == INTEGER_CST)
2216     rhs2 = fold_convert (type2, rhs2);
2217
2218   gimple_assign_set_rhs1 (stmt, rhs1);
2219   gimple_assign_set_rhs2 (stmt, rhs2);
2220   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2221   update_stmt (stmt);
2222   widen_mul_stats.widen_mults_inserted++;
2223   return true;
2224 }
2225
2226 /* Process a single gimple statement STMT, which is found at the
2227    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2228    rhs (given by CODE), and try to convert it into a
2229    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2230    is true iff we converted the statement.  */
2231
2232 static bool
2233 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2234                             enum tree_code code)
2235 {
2236   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2237   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2238   tree type, type1, type2, optype;
2239   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2240   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2241   optab this_optab;
2242   enum tree_code wmult_code;
2243   enum insn_code handler;
2244   enum machine_mode to_mode, from_mode, actual_mode;
2245   location_t loc = gimple_location (stmt);
2246   int actual_precision;
2247   bool from_unsigned1, from_unsigned2;
2248
2249   lhs = gimple_assign_lhs (stmt);
2250   type = TREE_TYPE (lhs);
2251   if (TREE_CODE (type) != INTEGER_TYPE
2252       && TREE_CODE (type) != FIXED_POINT_TYPE)
2253     return false;
2254
2255   if (code == MINUS_EXPR)
2256     wmult_code = WIDEN_MULT_MINUS_EXPR;
2257   else
2258     wmult_code = WIDEN_MULT_PLUS_EXPR;
2259
2260   rhs1 = gimple_assign_rhs1 (stmt);
2261   rhs2 = gimple_assign_rhs2 (stmt);
2262
2263   if (TREE_CODE (rhs1) == SSA_NAME)
2264     {
2265       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2266       if (is_gimple_assign (rhs1_stmt))
2267         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2268     }
2269
2270   if (TREE_CODE (rhs2) == SSA_NAME)
2271     {
2272       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2273       if (is_gimple_assign (rhs2_stmt))
2274         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2275     }
2276
2277   /* Allow for one conversion statement between the multiply
2278      and addition/subtraction statement.  If there are more than
2279      one conversions then we assume they would invalidate this
2280      transformation.  If that's not the case then they should have
2281      been folded before now.  */
2282   if (CONVERT_EXPR_CODE_P (rhs1_code))
2283     {
2284       conv1_stmt = rhs1_stmt;
2285       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2286       if (TREE_CODE (rhs1) == SSA_NAME)
2287         {
2288           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2289           if (is_gimple_assign (rhs1_stmt))
2290             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2291         }
2292       else
2293         return false;
2294     }
2295   if (CONVERT_EXPR_CODE_P (rhs2_code))
2296     {
2297       conv2_stmt = rhs2_stmt;
2298       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2299       if (TREE_CODE (rhs2) == SSA_NAME)
2300         {
2301           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2302           if (is_gimple_assign (rhs2_stmt))
2303             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2304         }
2305       else
2306         return false;
2307     }
2308
2309   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2310      is_widening_mult_p, but we still need the rhs returns.
2311
2312      It might also appear that it would be sufficient to use the existing
2313      operands of the widening multiply, but that would limit the choice of
2314      multiply-and-accumulate instructions.  */
2315   if (code == PLUS_EXPR
2316       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2317     {
2318       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2319                                &type2, &mult_rhs2))
2320         return false;
2321       add_rhs = rhs2;
2322       conv_stmt = conv1_stmt;
2323     }
2324   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2325     {
2326       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2327                                &type2, &mult_rhs2))
2328         return false;
2329       add_rhs = rhs1;
2330       conv_stmt = conv2_stmt;
2331     }
2332   else
2333     return false;
2334
2335   to_mode = TYPE_MODE (type);
2336   from_mode = TYPE_MODE (type1);
2337   from_unsigned1 = TYPE_UNSIGNED (type1);
2338   from_unsigned2 = TYPE_UNSIGNED (type2);
2339   optype = type1;
2340
2341   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2342   if (from_unsigned1 != from_unsigned2)
2343     {
2344       if (!INTEGRAL_TYPE_P (type))
2345         return false;
2346       /* We can use a signed multiply with unsigned types as long as
2347          there is a wider mode to use, or it is the smaller of the two
2348          types that is unsigned.  Note that type1 >= type2, always.  */
2349       if ((from_unsigned1
2350            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2351           || (from_unsigned2
2352               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2353         {
2354           from_mode = GET_MODE_WIDER_MODE (from_mode);
2355           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2356             return false;
2357         }
2358
2359       from_unsigned1 = from_unsigned2 = false;
2360       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2361                                                false);
2362     }
2363
2364   /* If there was a conversion between the multiply and addition
2365      then we need to make sure it fits a multiply-and-accumulate.
2366      The should be a single mode change which does not change the
2367      value.  */
2368   if (conv_stmt)
2369     {
2370       /* We use the original, unmodified data types for this.  */
2371       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2372       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2373       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2374       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2375
2376       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2377         {
2378           /* Conversion is a truncate.  */
2379           if (TYPE_PRECISION (to_type) < data_size)
2380             return false;
2381         }
2382       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2383         {
2384           /* Conversion is an extend.  Check it's the right sort.  */
2385           if (TYPE_UNSIGNED (from_type) != is_unsigned
2386               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2387             return false;
2388         }
2389       /* else convert is a no-op for our purposes.  */
2390     }
2391
2392   /* Verify that the machine can perform a widening multiply
2393      accumulate in this mode/signedness combination, otherwise
2394      this transformation is likely to pessimize code.  */
2395   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2396   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2397                                                   from_mode, 0, &actual_mode);
2398
2399   if (handler == CODE_FOR_nothing)
2400     return false;
2401
2402   /* Ensure that the inputs to the handler are in the correct precison
2403      for the opcode.  This will be the full mode size.  */
2404   actual_precision = GET_MODE_PRECISION (actual_mode);
2405   if (actual_precision != TYPE_PRECISION (type1)
2406       || from_unsigned1 != TYPE_UNSIGNED (type1))
2407     mult_rhs1 = build_and_insert_cast (gsi, loc,
2408                                        build_nonstandard_integer_type
2409                                          (actual_precision, from_unsigned1),
2410                                        mult_rhs1);
2411   if (actual_precision != TYPE_PRECISION (type2)
2412       || from_unsigned2 != TYPE_UNSIGNED (type2))
2413     mult_rhs2 = build_and_insert_cast (gsi, loc,
2414                                        build_nonstandard_integer_type
2415                                          (actual_precision, from_unsigned2),
2416                                        mult_rhs2);
2417
2418   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2419     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2420
2421   /* Handle constants.  */
2422   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2423     mult_rhs1 = fold_convert (type1, mult_rhs1);
2424   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2425     mult_rhs2 = fold_convert (type2, mult_rhs2);
2426
2427   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2428                                     add_rhs);
2429   update_stmt (gsi_stmt (*gsi));
2430   widen_mul_stats.maccs_inserted++;
2431   return true;
2432 }
2433
2434 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2435    with uses in additions and subtractions to form fused multiply-add
2436    operations.  Returns true if successful and MUL_STMT should be removed.  */
2437
2438 static bool
2439 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2440 {
2441   tree mul_result = gimple_get_lhs (mul_stmt);
2442   tree type = TREE_TYPE (mul_result);
2443   gimple use_stmt, neguse_stmt, fma_stmt;
2444   use_operand_p use_p;
2445   imm_use_iterator imm_iter;
2446
2447   if (FLOAT_TYPE_P (type)
2448       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2449     return false;
2450
2451   /* We don't want to do bitfield reduction ops.  */
2452   if (INTEGRAL_TYPE_P (type)
2453       && (TYPE_PRECISION (type)
2454           != GET_MODE_PRECISION (TYPE_MODE (type))))
2455     return false;
2456
2457   /* If the target doesn't support it, don't generate it.  We assume that
2458      if fma isn't available then fms, fnma or fnms are not either.  */
2459   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2460     return false;
2461
2462   /* If the multiplication has zero uses, it is kept around probably because
2463      of -fnon-call-exceptions.  Don't optimize it away in that case,
2464      it is DCE job.  */
2465   if (has_zero_uses (mul_result))
2466     return false;
2467
2468   /* Make sure that the multiplication statement becomes dead after
2469      the transformation, thus that all uses are transformed to FMAs.
2470      This means we assume that an FMA operation has the same cost
2471      as an addition.  */
2472   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2473     {
2474       enum tree_code use_code;
2475       tree result = mul_result;
2476       bool negate_p = false;
2477
2478       use_stmt = USE_STMT (use_p);
2479
2480       if (is_gimple_debug (use_stmt))
2481         continue;
2482
2483       /* For now restrict this operations to single basic blocks.  In theory
2484          we would want to support sinking the multiplication in
2485          m = a*b;
2486          if ()
2487            ma = m + c;
2488          else
2489            d = m;
2490          to form a fma in the then block and sink the multiplication to the
2491          else block.  */
2492       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2493         return false;
2494
2495       if (!is_gimple_assign (use_stmt))
2496         return false;
2497
2498       use_code = gimple_assign_rhs_code (use_stmt);
2499
2500       /* A negate on the multiplication leads to FNMA.  */
2501       if (use_code == NEGATE_EXPR)
2502         {
2503           ssa_op_iter iter;
2504           use_operand_p usep;
2505
2506           result = gimple_assign_lhs (use_stmt);
2507
2508           /* Make sure the negate statement becomes dead with this
2509              single transformation.  */
2510           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2511                                &use_p, &neguse_stmt))
2512             return false;
2513
2514           /* Make sure the multiplication isn't also used on that stmt.  */
2515           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2516             if (USE_FROM_PTR (usep) == mul_result)
2517               return false;
2518
2519           /* Re-validate.  */
2520           use_stmt = neguse_stmt;
2521           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2522             return false;
2523           if (!is_gimple_assign (use_stmt))
2524             return false;
2525
2526           use_code = gimple_assign_rhs_code (use_stmt);
2527           negate_p = true;
2528         }
2529
2530       switch (use_code)
2531         {
2532         case MINUS_EXPR:
2533           if (gimple_assign_rhs2 (use_stmt) == result)
2534             negate_p = !negate_p;
2535           break;
2536         case PLUS_EXPR:
2537           break;
2538         default:
2539           /* FMA can only be formed from PLUS and MINUS.  */
2540           return false;
2541         }
2542
2543       /* We can't handle a * b + a * b.  */
2544       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2545         return false;
2546
2547       /* While it is possible to validate whether or not the exact form
2548          that we've recognized is available in the backend, the assumption
2549          is that the transformation is never a loss.  For instance, suppose
2550          the target only has the plain FMA pattern available.  Consider
2551          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2552          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2553          still have 3 operations, but in the FMA form the two NEGs are
2554          independent and could be run in parallel.  */
2555     }
2556
2557   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2558     {
2559       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2560       enum tree_code use_code;
2561       tree addop, mulop1 = op1, result = mul_result;
2562       bool negate_p = false;
2563
2564       if (is_gimple_debug (use_stmt))
2565         continue;
2566
2567       use_code = gimple_assign_rhs_code (use_stmt);
2568       if (use_code == NEGATE_EXPR)
2569         {
2570           result = gimple_assign_lhs (use_stmt);
2571           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2572           gsi_remove (&gsi, true);
2573           release_defs (use_stmt);
2574
2575           use_stmt = neguse_stmt;
2576           gsi = gsi_for_stmt (use_stmt);
2577           use_code = gimple_assign_rhs_code (use_stmt);
2578           negate_p = true;
2579         }
2580
2581       if (gimple_assign_rhs1 (use_stmt) == result)
2582         {
2583           addop = gimple_assign_rhs2 (use_stmt);
2584           /* a * b - c -> a * b + (-c)  */
2585           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2586             addop = force_gimple_operand_gsi (&gsi,
2587                                               build1 (NEGATE_EXPR,
2588                                                       type, addop),
2589                                               true, NULL_TREE, true,
2590                                               GSI_SAME_STMT);
2591         }
2592       else
2593         {
2594           addop = gimple_assign_rhs1 (use_stmt);
2595           /* a - b * c -> (-b) * c + a */
2596           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2597             negate_p = !negate_p;
2598         }
2599
2600       if (negate_p)
2601         mulop1 = force_gimple_operand_gsi (&gsi,
2602                                            build1 (NEGATE_EXPR,
2603                                                    type, mulop1),
2604                                            true, NULL_TREE, true,
2605                                            GSI_SAME_STMT);
2606
2607       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2608                                                 gimple_assign_lhs (use_stmt),
2609                                                 mulop1, op2,
2610                                                 addop);
2611       gsi_replace (&gsi, fma_stmt, true);
2612       widen_mul_stats.fmas_inserted++;
2613     }
2614
2615   return true;
2616 }
2617
2618 /* Find integer multiplications where the operands are extended from
2619    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2620    where appropriate.  */
2621
2622 static unsigned int
2623 execute_optimize_widening_mul (void)
2624 {
2625   basic_block bb;
2626   bool cfg_changed = false;
2627
2628   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2629
2630   FOR_EACH_BB (bb)
2631     {
2632       gimple_stmt_iterator gsi;
2633
2634       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2635         {
2636           gimple stmt = gsi_stmt (gsi);
2637           enum tree_code code;
2638
2639           if (is_gimple_assign (stmt))
2640             {
2641               code = gimple_assign_rhs_code (stmt);
2642               switch (code)
2643                 {
2644                 case MULT_EXPR:
2645                   if (!convert_mult_to_widen (stmt, &gsi)
2646                       && convert_mult_to_fma (stmt,
2647                                               gimple_assign_rhs1 (stmt),
2648                                               gimple_assign_rhs2 (stmt)))
2649                     {
2650                       gsi_remove (&gsi, true);
2651                       release_defs (stmt);
2652                       continue;
2653                     }
2654                   break;
2655
2656                 case PLUS_EXPR:
2657                 case MINUS_EXPR:
2658                   convert_plusminus_to_widen (&gsi, stmt, code);
2659                   break;
2660
2661                 default:;
2662                 }
2663             }
2664           else if (is_gimple_call (stmt)
2665                    && gimple_call_lhs (stmt))
2666             {
2667               tree fndecl = gimple_call_fndecl (stmt);
2668               if (fndecl
2669                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2670                 {
2671                   switch (DECL_FUNCTION_CODE (fndecl))
2672                     {
2673                       case BUILT_IN_POWF:
2674                       case BUILT_IN_POW:
2675                       case BUILT_IN_POWL:
2676                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2677                             && REAL_VALUES_EQUAL
2678                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2679                                   dconst2)
2680                             && convert_mult_to_fma (stmt,
2681                                                     gimple_call_arg (stmt, 0),
2682                                                     gimple_call_arg (stmt, 0)))
2683                           {
2684                             unlink_stmt_vdef (stmt);
2685                             if (gsi_remove (&gsi, true)
2686                                 && gimple_purge_dead_eh_edges (bb))
2687                               cfg_changed = true;
2688                             release_defs (stmt);
2689                             continue;
2690                           }
2691                           break;
2692
2693                       default:;
2694                     }
2695                 }
2696             }
2697           gsi_next (&gsi);
2698         }
2699     }
2700
2701   statistics_counter_event (cfun, "widening multiplications inserted",
2702                             widen_mul_stats.widen_mults_inserted);
2703   statistics_counter_event (cfun, "widening maccs inserted",
2704                             widen_mul_stats.maccs_inserted);
2705   statistics_counter_event (cfun, "fused multiply-adds inserted",
2706                             widen_mul_stats.fmas_inserted);
2707
2708   return cfg_changed ? TODO_cleanup_cfg : 0;
2709 }
2710
2711 static bool
2712 gate_optimize_widening_mul (void)
2713 {
2714   return flag_expensive_optimizations && optimize;
2715 }
2716
2717 struct gimple_opt_pass pass_optimize_widening_mul =
2718 {
2719  {
2720   GIMPLE_PASS,
2721   "widening_mul",                       /* name */
2722   gate_optimize_widening_mul,           /* gate */
2723   execute_optimize_widening_mul,        /* execute */
2724   NULL,                                 /* sub */
2725   NULL,                                 /* next */
2726   0,                                    /* static_pass_number */
2727   TV_NONE,                              /* tv_id */
2728   PROP_ssa,                             /* properties_required */
2729   0,                                    /* properties_provided */
2730   0,                                    /* properties_destroyed */
2731   0,                                    /* todo_flags_start */
2732   TODO_verify_ssa
2733   | TODO_verify_stmts
2734   | TODO_update_ssa                     /* todo_flags_finish */
2735  }
2736 };