[gcc.git] / gcc / tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
 20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
 43    First of all, with some experiments it was found that the
 44    transformation is not always useful if there are only two divisions
 45    by the same divisor.  This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
 63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
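/* As a concrete sketch of the machinery below: assume blocks B1 and B2
   both contain a division by the same SSA name D, B1 dominates B2, and
   B2 post-dominates B1.  register_division_in links B2's occurrence as a
   child of B1's, compute_merit then credits B1 with two post-dominating
   divisions (ignoring exception edges), and, assuming the target's
   min_divisions_for_recip_mul threshold is 2, insert_reciprocals emits
   RD = 1.0 / D right before the division in B1; both divisions are then
   rewritten as multiplications by RD.  */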
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "gimplify.h"
94 #include "gimple-iterator.h"
95 #include "gimple-ssa.h"
96 #include "tree-cfg.h"
97 #include "tree-phinodes.h"
98 #include "ssa-iterators.h"
99 #include "tree-ssanames.h"
100 #include "tree-dfa.h"
101 #include "tree-ssa.h"
102 #include "tree-pass.h"
103 #include "alloc-pool.h"
104 #include "basic-block.h"
105 #include "target.h"
106 #include "gimple-pretty-print.h"
107
108 /* FIXME: RTL headers have to be included here for optabs. */
109 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
110 #include "expr.h" /* Because optabs.h wants sepops. */
111 #include "optabs.h"
112
113 /* This structure represents one basic block that either computes a
 114    division, or is a common dominator for basic blocks that compute a
115 division. */
116 struct occurrence {
117 /* The basic block represented by this structure. */
118 basic_block bb;
119
120 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
121 inserted in BB. */
122 tree recip_def;
123
124 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
125 was inserted in BB. */
126 gimple recip_def_stmt;
127
128 /* Pointer to a list of "struct occurrence"s for blocks dominated
129 by BB. */
130 struct occurrence *children;
131
 132   /* Pointer to the next "struct occurrence" in the list of blocks
133 sharing a common dominator. */
134 struct occurrence *next;
135
136 /* The number of divisions that are in BB before compute_merit. The
137 number of divisions that are in BB or post-dominate it after
138 compute_merit. */
139 int num_divisions;
140
141 /* True if the basic block has a division, false if it is a common
142 dominator for basic blocks that do. If it is false and trapping
143 math is active, BB is not a candidate for inserting a reciprocal. */
144 bool bb_has_division;
145 };
146
147 static struct
148 {
149 /* Number of 1.0/X ops inserted. */
150 int rdivs_inserted;
151
152 /* Number of 1.0/FUNC ops inserted. */
153 int rfuncs_inserted;
154 } reciprocal_stats;
155
156 static struct
157 {
158 /* Number of cexpi calls inserted. */
159 int inserted;
160 } sincos_stats;
161
162 static struct
163 {
164 /* Number of hand-written 16-bit bswaps found. */
165 int found_16bit;
166
167 /* Number of hand-written 32-bit bswaps found. */
168 int found_32bit;
169
170 /* Number of hand-written 64-bit bswaps found. */
171 int found_64bit;
172 } bswap_stats;
173
174 static struct
175 {
176 /* Number of widening multiplication ops inserted. */
177 int widen_mults_inserted;
178
179 /* Number of integer multiply-and-accumulate ops inserted. */
180 int maccs_inserted;
181
182 /* Number of fp fused multiply-add ops inserted. */
183 int fmas_inserted;
184 } widen_mul_stats;
185
186 /* The instance of "struct occurrence" representing the highest
187 interesting block in the dominator tree. */
188 static struct occurrence *occ_head;
189
190 /* Allocation pool for getting instances of "struct occurrence". */
191 static alloc_pool occ_pool;
192
193
194
195 /* Allocate and return a new struct occurrence for basic block BB, and
196 whose children list is headed by CHILDREN. */
197 static struct occurrence *
198 occ_new (basic_block bb, struct occurrence *children)
199 {
200 struct occurrence *occ;
201
202 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
203 memset (occ, 0, sizeof (struct occurrence));
204
205 occ->bb = bb;
206 occ->children = children;
207 return occ;
208 }
209
210
211 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
212 list of "struct occurrence"s, one per basic block, having IDOM as
213 their common dominator.
214
215 We try to insert NEW_OCC as deep as possible in the tree, and we also
216 insert any other block that is a common dominator for BB and one
217 block already in the tree. */
218
219 static void
220 insert_bb (struct occurrence *new_occ, basic_block idom,
221 struct occurrence **p_head)
222 {
223 struct occurrence *occ, **p_occ;
224
225 for (p_occ = p_head; (occ = *p_occ) != NULL; )
226 {
227 basic_block bb = new_occ->bb, occ_bb = occ->bb;
228 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
229 if (dom == bb)
230 {
231 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
232 from its list. */
233 *p_occ = occ->next;
234 occ->next = new_occ->children;
235 new_occ->children = occ;
236
237 /* Try the next block (it may as well be dominated by BB). */
238 }
239
240 else if (dom == occ_bb)
241 {
242 /* OCC_BB dominates BB. Tail recurse to look deeper. */
243 insert_bb (new_occ, dom, &occ->children);
244 return;
245 }
246
247 else if (dom != idom)
248 {
249 gcc_assert (!dom->aux);
250
251 /* There is a dominator between IDOM and BB, add it and make
252 two children out of NEW_OCC and OCC. First, remove OCC from
253 its list. */
254 *p_occ = occ->next;
255 new_occ->next = occ;
256 occ->next = NULL;
257
258 /* None of the previous blocks has DOM as a dominator: if we tail
259 recursed, we would reexamine them uselessly. Just switch BB with
260 DOM, and go on looking for blocks dominated by DOM. */
261 new_occ = occ_new (dom, new_occ);
262 }
263
264 else
265 {
266 /* Nothing special, go on with the next element. */
267 p_occ = &occ->next;
268 }
269 }
270
271 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
272 new_occ->next = *p_head;
273 *p_head = new_occ;
274 }
275
276 /* Register that we found a division in BB. */
277
278 static inline void
279 register_division_in (basic_block bb)
280 {
281 struct occurrence *occ;
282
283 occ = (struct occurrence *) bb->aux;
284 if (!occ)
285 {
286 occ = occ_new (bb, NULL);
287 insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
288 }
289
290 occ->bb_has_division = true;
291 occ->num_divisions++;
292 }
293
294
295 /* Compute the number of divisions that postdominate each block in OCC and
296 its children. */
297
298 static void
299 compute_merit (struct occurrence *occ)
300 {
301 struct occurrence *occ_child;
302 basic_block dom = occ->bb;
303
304 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
305 {
306 basic_block bb;
307 if (occ_child->children)
308 compute_merit (occ_child);
309
310 if (flag_exceptions)
311 bb = single_noncomplex_succ (dom);
312 else
313 bb = dom;
314
315 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
316 occ->num_divisions += occ_child->num_divisions;
317 }
318 }
319
320
321 /* Return whether USE_STMT is a floating-point division by DEF. */
322 static inline bool
323 is_division_by (gimple use_stmt, tree def)
324 {
325 return is_gimple_assign (use_stmt)
326 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
327 && gimple_assign_rhs2 (use_stmt) == def
328 /* Do not recognize x / x as valid division, as we are getting
329 confused later by replacing all immediate uses x in such
330 a stmt. */
331 && gimple_assign_rhs1 (use_stmt) != def;
332 }
333
334 /* Walk the subset of the dominator tree rooted at OCC, setting the
335 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
336 the given basic block. The field may be left NULL, of course,
337 if it is not possible or profitable to do the optimization.
338
339 DEF_BSI is an iterator pointing at the statement defining DEF.
340 If RECIP_DEF is set, a dominator already has a computation that can
341 be used. */
342
343 static void
344 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
345 tree def, tree recip_def, int threshold)
346 {
347 tree type;
348 gimple new_stmt;
349 gimple_stmt_iterator gsi;
350 struct occurrence *occ_child;
351
352 if (!recip_def
353 && (occ->bb_has_division || !flag_trapping_math)
354 && occ->num_divisions >= threshold)
355 {
356 /* Make a variable with the replacement and substitute it. */
357 type = TREE_TYPE (def);
358 recip_def = create_tmp_reg (type, "reciptmp");
359 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
360 build_one_cst (type), def);
361
362 if (occ->bb_has_division)
363 {
364 /* Case 1: insert before an existing division. */
365 gsi = gsi_after_labels (occ->bb);
366 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
367 gsi_next (&gsi);
368
369 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
370 }
371 else if (def_gsi && occ->bb == def_gsi->bb)
372 {
373 /* Case 2: insert right after the definition. Note that this will
374 never happen if the definition statement can throw, because in
375 that case the sole successor of the statement's basic block will
376 dominate all the uses as well. */
377 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
378 }
379 else
380 {
381 /* Case 3: insert in a basic block not containing defs/uses. */
382 gsi = gsi_after_labels (occ->bb);
383 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
384 }
385
386 reciprocal_stats.rdivs_inserted++;
387
388 occ->recip_def_stmt = new_stmt;
389 }
390
391 occ->recip_def = recip_def;
392 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
393 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
394 }
395
396
397 /* Replace the division at USE_P with a multiplication by the reciprocal, if
398 possible. */
399
400 static inline void
401 replace_reciprocal (use_operand_p use_p)
402 {
403 gimple use_stmt = USE_STMT (use_p);
404 basic_block bb = gimple_bb (use_stmt);
405 struct occurrence *occ = (struct occurrence *) bb->aux;
406
407 if (optimize_bb_for_speed_p (bb)
408 && occ->recip_def && use_stmt != occ->recip_def_stmt)
409 {
410 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
411 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
412 SET_USE (use_p, occ->recip_def);
413 fold_stmt_inplace (&gsi);
414 update_stmt (use_stmt);
415 }
416 }
417
418
419 /* Free OCC and return one more "struct occurrence" to be freed. */
420
421 static struct occurrence *
422 free_bb (struct occurrence *occ)
423 {
424 struct occurrence *child, *next;
425
426 /* First get the two pointers hanging off OCC. */
427 next = occ->next;
428 child = occ->children;
429 occ->bb->aux = NULL;
430 pool_free (occ_pool, occ);
431
432 /* Now ensure that we don't recurse unless it is necessary. */
433 if (!child)
434 return next;
435 else
436 {
437 while (next)
438 next = free_bb (next);
439
440 return child;
441 }
442 }
443
444
445 /* Look for floating-point divisions among DEF's uses, and try to
446 replace them by multiplications with the reciprocal. Add
447 as many statements computing the reciprocal as needed.
448
449 DEF must be a GIMPLE register of a floating-point type. */
450
451 static void
452 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
453 {
454 use_operand_p use_p;
455 imm_use_iterator use_iter;
456 struct occurrence *occ;
457 int count = 0, threshold;
458
459 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
460
461 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
462 {
463 gimple use_stmt = USE_STMT (use_p);
464 if (is_division_by (use_stmt, def))
465 {
466 register_division_in (gimple_bb (use_stmt));
467 count++;
468 }
469 }
470
471 /* Do the expensive part only if we can hope to optimize something. */
472 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
473 if (count >= threshold)
474 {
475 gimple use_stmt;
476 for (occ = occ_head; occ; occ = occ->next)
477 {
478 compute_merit (occ);
479 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
480 }
481
482 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
483 {
484 if (is_division_by (use_stmt, def))
485 {
486 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
487 replace_reciprocal (use_p);
488 }
489 }
490 }
491
492 for (occ = occ_head; occ; )
493 occ = free_bb (occ);
494
495 occ_head = NULL;
496 }
497
498 static bool
499 gate_cse_reciprocals (void)
500 {
501 return optimize && flag_reciprocal_math;
502 }
503
504 /* Go through all the floating-point SSA_NAMEs, and call
505 execute_cse_reciprocals_1 on each of them. */
506 static unsigned int
507 execute_cse_reciprocals (void)
508 {
509 basic_block bb;
510 tree arg;
511
512 occ_pool = create_alloc_pool ("dominators for recip",
513 sizeof (struct occurrence),
514 n_basic_blocks / 3 + 1);
515
516 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
517 calculate_dominance_info (CDI_DOMINATORS);
518 calculate_dominance_info (CDI_POST_DOMINATORS);
519
520 #ifdef ENABLE_CHECKING
521 FOR_EACH_BB (bb)
522 gcc_assert (!bb->aux);
523 #endif
524
525 for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
526 if (FLOAT_TYPE_P (TREE_TYPE (arg))
527 && is_gimple_reg (arg))
528 {
529 tree name = ssa_default_def (cfun, arg);
530 if (name)
531 execute_cse_reciprocals_1 (NULL, name);
532 }
533
534 FOR_EACH_BB (bb)
535 {
536 gimple_stmt_iterator gsi;
537 gimple phi;
538 tree def;
539
540 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
541 {
542 phi = gsi_stmt (gsi);
543 def = PHI_RESULT (phi);
544 if (! virtual_operand_p (def)
545 && FLOAT_TYPE_P (TREE_TYPE (def)))
546 execute_cse_reciprocals_1 (NULL, def);
547 }
548
549 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
550 {
551 gimple stmt = gsi_stmt (gsi);
552
553 if (gimple_has_lhs (stmt)
554 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
555 && FLOAT_TYPE_P (TREE_TYPE (def))
556 && TREE_CODE (def) == SSA_NAME)
557 execute_cse_reciprocals_1 (&gsi, def);
558 }
559
560 if (optimize_bb_for_size_p (bb))
561 continue;
562
563 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
564 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
565 {
566 gimple stmt = gsi_stmt (gsi);
567 tree fndecl;
568
569 if (is_gimple_assign (stmt)
570 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
571 {
572 tree arg1 = gimple_assign_rhs2 (stmt);
573 gimple stmt1;
574
575 if (TREE_CODE (arg1) != SSA_NAME)
576 continue;
577
578 stmt1 = SSA_NAME_DEF_STMT (arg1);
579
580 if (is_gimple_call (stmt1)
581 && gimple_call_lhs (stmt1)
582 && (fndecl = gimple_call_fndecl (stmt1))
583 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
584 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
585 {
586 enum built_in_function code;
587 bool md_code, fail;
588 imm_use_iterator ui;
589 use_operand_p use_p;
590
591 code = DECL_FUNCTION_CODE (fndecl);
592 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
593
594 fndecl = targetm.builtin_reciprocal (code, md_code, false);
595 if (!fndecl)
596 continue;
597
598 /* Check that all uses of the SSA name are divisions,
599 otherwise replacing the defining statement will do
600 the wrong thing. */
601 fail = false;
602 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
603 {
604 gimple stmt2 = USE_STMT (use_p);
605 if (is_gimple_debug (stmt2))
606 continue;
607 if (!is_gimple_assign (stmt2)
608 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
609 || gimple_assign_rhs1 (stmt2) == arg1
610 || gimple_assign_rhs2 (stmt2) != arg1)
611 {
612 fail = true;
613 break;
614 }
615 }
616 if (fail)
617 continue;
618
619 gimple_replace_ssa_lhs (stmt1, arg1);
620 gimple_call_set_fndecl (stmt1, fndecl);
621 update_stmt (stmt1);
622 reciprocal_stats.rfuncs_inserted++;
623
624 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
625 {
626 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
627 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
628 fold_stmt_inplace (&gsi);
629 update_stmt (stmt);
630 }
631 }
632 }
633 }
634 }
635
636 statistics_counter_event (cfun, "reciprocal divs inserted",
637 reciprocal_stats.rdivs_inserted);
638 statistics_counter_event (cfun, "reciprocal functions inserted",
639 reciprocal_stats.rfuncs_inserted);
640
641 free_dominance_info (CDI_DOMINATORS);
642 free_dominance_info (CDI_POST_DOMINATORS);
643 free_alloc_pool (occ_pool);
644 return 0;
645 }
646
647 namespace {
648
649 const pass_data pass_data_cse_reciprocals =
650 {
651 GIMPLE_PASS, /* type */
652 "recip", /* name */
653 OPTGROUP_NONE, /* optinfo_flags */
654 true, /* has_gate */
655 true, /* has_execute */
656 TV_NONE, /* tv_id */
657 PROP_ssa, /* properties_required */
658 0, /* properties_provided */
659 0, /* properties_destroyed */
660 0, /* todo_flags_start */
661 ( TODO_update_ssa | TODO_verify_ssa
662 | TODO_verify_stmts ), /* todo_flags_finish */
663 };
664
665 class pass_cse_reciprocals : public gimple_opt_pass
666 {
667 public:
668 pass_cse_reciprocals (gcc::context *ctxt)
669 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
670 {}
671
672 /* opt_pass methods: */
673 bool gate () { return gate_cse_reciprocals (); }
674 unsigned int execute () { return execute_cse_reciprocals (); }
675
676 }; // class pass_cse_reciprocals
677
678 } // anon namespace
679
680 gimple_opt_pass *
681 make_pass_cse_reciprocals (gcc::context *ctxt)
682 {
683 return new pass_cse_reciprocals (ctxt);
684 }
685
686 /* Records an occurrence at statement USE_STMT in the vector of trees
687 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
688 is not yet initialized. Returns true if the occurrence was pushed on
689 the vector. Adjusts *TOP_BB to be the basic block dominating all
690 statements in the vector. */
691
692 static bool
693 maybe_record_sincos (vec<gimple> *stmts,
694 basic_block *top_bb, gimple use_stmt)
695 {
696 basic_block use_bb = gimple_bb (use_stmt);
697 if (*top_bb
698 && (*top_bb == use_bb
699 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
700 stmts->safe_push (use_stmt);
701 else if (!*top_bb
702 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
703 {
704 stmts->safe_push (use_stmt);
705 *top_bb = use_bb;
706 }
707 else
708 return false;
709
710 return true;
711 }
712
713 /* Look for sin, cos and cexpi calls with the same argument NAME and
714 create a single call to cexpi CSEing the result in this case.
715 We first walk over all immediate uses of the argument collecting
716 statements that we can CSE in a vector and in a second pass replace
717 the statement rhs with a REALPART or IMAGPART expression on the
718 result of the cexpi call we insert before the use statement that
719 dominates all other candidates. */
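   For example, assuming cexpi is available for the type of x:

     a = sin (x);                 t = cexpi (x);
     b = cos (x);       ==>       a = IMAGPART_EXPR <t>;
                                  b = REALPART_EXPR <t>;

   where the cexpi call is inserted in the block dominating both
   original calls.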
720
721 static bool
722 execute_cse_sincos_1 (tree name)
723 {
724 gimple_stmt_iterator gsi;
725 imm_use_iterator use_iter;
726 tree fndecl, res, type;
727 gimple def_stmt, use_stmt, stmt;
728 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
729 vec<gimple> stmts = vNULL;
730 basic_block top_bb = NULL;
731 int i;
732 bool cfg_changed = false;
733
734 type = TREE_TYPE (name);
735 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
736 {
737 if (gimple_code (use_stmt) != GIMPLE_CALL
738 || !gimple_call_lhs (use_stmt)
739 || !(fndecl = gimple_call_fndecl (use_stmt))
740 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
741 continue;
742
743 switch (DECL_FUNCTION_CODE (fndecl))
744 {
745 CASE_FLT_FN (BUILT_IN_COS):
746 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
747 break;
748
749 CASE_FLT_FN (BUILT_IN_SIN):
750 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
751 break;
752
753 CASE_FLT_FN (BUILT_IN_CEXPI):
754 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
755 break;
756
757 default:;
758 }
759 }
760
761 if (seen_cos + seen_sin + seen_cexpi <= 1)
762 {
763 stmts.release ();
764 return false;
765 }
766
767 /* Simply insert cexpi at the beginning of top_bb but not earlier than
768 the name def statement. */
769 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
770 if (!fndecl)
771 return false;
772 stmt = gimple_build_call (fndecl, 1, name);
773 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
774 gimple_call_set_lhs (stmt, res);
775
776 def_stmt = SSA_NAME_DEF_STMT (name);
777 if (!SSA_NAME_IS_DEFAULT_DEF (name)
778 && gimple_code (def_stmt) != GIMPLE_PHI
779 && gimple_bb (def_stmt) == top_bb)
780 {
781 gsi = gsi_for_stmt (def_stmt);
782 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
783 }
784 else
785 {
786 gsi = gsi_after_labels (top_bb);
787 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
788 }
789 sincos_stats.inserted++;
790
791 /* And adjust the recorded old call sites. */
792 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
793 {
794 tree rhs = NULL;
795 fndecl = gimple_call_fndecl (use_stmt);
796
797 switch (DECL_FUNCTION_CODE (fndecl))
798 {
799 CASE_FLT_FN (BUILT_IN_COS):
800 rhs = fold_build1 (REALPART_EXPR, type, res);
801 break;
802
803 CASE_FLT_FN (BUILT_IN_SIN):
804 rhs = fold_build1 (IMAGPART_EXPR, type, res);
805 break;
806
807 CASE_FLT_FN (BUILT_IN_CEXPI):
808 rhs = res;
809 break;
810
811 default:;
812 gcc_unreachable ();
813 }
814
815 /* Replace call with a copy. */
816 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
817
818 gsi = gsi_for_stmt (use_stmt);
819 gsi_replace (&gsi, stmt, true);
820 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
821 cfg_changed = true;
822 }
823
824 stmts.release ();
825
826 return cfg_changed;
827 }
828
829 /* To evaluate powi(x,n), the floating point value x raised to the
830 constant integer exponent n, we use a hybrid algorithm that
831 combines the "window method" with look-up tables. For an
832 introduction to exponentiation algorithms and "addition chains",
833 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
834 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
835 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
836 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
837
838 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
839 multiplications to inline before calling the system library's pow
840 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
841 so this default never requires calling pow, powf or powl. */
842
843 #ifndef POWI_MAX_MULTS
844 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
845 #endif
846
847 /* The size of the "optimal power tree" lookup table. All
848 exponents less than this value are simply looked up in the
849 powi_table below. This threshold is also used to size the
850 cache of pseudo registers that hold intermediate results. */
851 #define POWI_TABLE_SIZE 256
852
 853 /* The size, in bits, of the window used in the "window method"
854 exponentiation algorithm. This is equivalent to a radix of
855 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
856 #define POWI_WINDOW_SIZE 3
857
858 /* The following table is an efficient representation of an
859 "optimal power tree". For each value, i, the corresponding
 860    value, j, in the table states that an optimal evaluation
861 sequence for calculating pow(x,i) can be found by evaluating
862 pow(x,j)*pow(x,i-j). An optimal power tree for the first
863 100 integers is given in Knuth's "Seminumerical algorithms". */
864
865 static const unsigned char powi_table[POWI_TABLE_SIZE] =
866 {
867 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
868 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
869 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
870 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
871 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
872 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
873 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
874 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
875 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
876 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
877 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
878 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
879 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
880 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
881 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
882 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
883 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
884 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
885 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
886 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
887 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
888 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
889 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
890 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
891 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
892 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
893 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
894 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
895 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
896 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
897 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
898 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
899 };
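/* For example, powi_table[7] is 4, so x**7 is evaluated as x**3 * x**4;
   expanding x**3 and x**4 through powi_table[3] == 2 and powi_table[4] == 2
   yields the sequence x*x, x**2 * x, x**2 * x**2, x**3 * x**4, i.e. four
   multiplications in total.  */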
900
901
902 /* Return the number of multiplications required to calculate
903 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
904 subroutine of powi_cost. CACHE is an array indicating
905 which exponents have already been calculated. */
906
907 static int
908 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
909 {
910 /* If we've already calculated this exponent, then this evaluation
911 doesn't require any additional multiplications. */
912 if (cache[n])
913 return 0;
914
915 cache[n] = true;
916 return powi_lookup_cost (n - powi_table[n], cache)
917 + powi_lookup_cost (powi_table[n], cache) + 1;
918 }
919
920 /* Return the number of multiplications required to calculate
921 powi(x,n) for an arbitrary x, given the exponent N. This
922 function needs to be kept in sync with powi_as_mults below. */
923
924 static int
925 powi_cost (HOST_WIDE_INT n)
926 {
927 bool cache[POWI_TABLE_SIZE];
928 unsigned HOST_WIDE_INT digit;
929 unsigned HOST_WIDE_INT val;
930 int result;
931
932 if (n == 0)
933 return 0;
934
935 /* Ignore the reciprocal when calculating the cost. */
936 val = (n < 0) ? -n : n;
937
938 /* Initialize the exponent cache. */
939 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
940 cache[1] = true;
941
942 result = 0;
943
944 while (val >= POWI_TABLE_SIZE)
945 {
946 if (val & 1)
947 {
948 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
949 result += powi_lookup_cost (digit, cache)
950 + POWI_WINDOW_SIZE + 1;
951 val >>= POWI_WINDOW_SIZE;
952 }
953 else
954 {
955 val >>= 1;
956 result++;
957 }
958 }
959
960 return result + powi_lookup_cost (val, cache);
961 }
962
963 /* Recursive subroutine of powi_as_mults. This function takes the
964 array, CACHE, of already calculated exponents and an exponent N and
965 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
966
967 static tree
968 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
969 HOST_WIDE_INT n, tree *cache)
970 {
971 tree op0, op1, ssa_target;
972 unsigned HOST_WIDE_INT digit;
973 gimple mult_stmt;
974
975 if (n < POWI_TABLE_SIZE && cache[n])
976 return cache[n];
977
978 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
979
980 if (n < POWI_TABLE_SIZE)
981 {
982 cache[n] = ssa_target;
983 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
984 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
985 }
986 else if (n & 1)
987 {
988 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
989 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
990 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
991 }
992 else
993 {
994 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
995 op1 = op0;
996 }
997
998 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
999 gimple_set_location (mult_stmt, loc);
1000 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1001
1002 return ssa_target;
1003 }
1004
1005 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1006 This function needs to be kept in sync with powi_cost above. */
1007
1008 static tree
1009 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1010 tree arg0, HOST_WIDE_INT n)
1011 {
1012 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1013 gimple div_stmt;
1014 tree target;
1015
1016 if (n == 0)
1017 return build_real (type, dconst1);
1018
1019 memset (cache, 0, sizeof (cache));
1020 cache[1] = arg0;
1021
1022 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1023 if (n >= 0)
1024 return result;
1025
1026 /* If the original exponent was negative, reciprocate the result. */
1027 target = make_temp_ssa_name (type, NULL, "powmult");
1028 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1029 build_real (type, dconst1),
1030 result);
1031 gimple_set_location (div_stmt, loc);
1032 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1033
1034 return target;
1035 }
1036
1037 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1038 location info LOC. If the arguments are appropriate, create an
1039 equivalent sequence of statements prior to GSI using an optimal
 1040    number of multiplications, and return an expression holding the
1041 result. */
1042
1043 static tree
1044 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1045 tree arg0, HOST_WIDE_INT n)
1046 {
1047 /* Avoid largest negative number. */
1048 if (n != -n
1049 && ((n >= -1 && n <= 2)
1050 || (optimize_function_for_speed_p (cfun)
1051 && powi_cost (n) <= POWI_MAX_MULTS)))
1052 return powi_as_mults (gsi, loc, arg0, n);
1053
1054 return NULL_TREE;
1055 }
1056
1057 /* Build a gimple call statement that calls FN with argument ARG.
1058 Set the lhs of the call statement to a fresh SSA name. Insert the
1059 statement prior to GSI's current position, and return the fresh
1060 SSA name. */
1061
1062 static tree
1063 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1064 tree fn, tree arg)
1065 {
1066 gimple call_stmt;
1067 tree ssa_target;
1068
1069 call_stmt = gimple_build_call (fn, 1, arg);
1070 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1071 gimple_set_lhs (call_stmt, ssa_target);
1072 gimple_set_location (call_stmt, loc);
1073 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1074
1075 return ssa_target;
1076 }
1077
1078 /* Build a gimple binary operation with the given CODE and arguments
1079 ARG0, ARG1, assigning the result to a new SSA name for variable
1080 TARGET. Insert the statement prior to GSI's current position, and
1081 return the fresh SSA name.*/
1082
1083 static tree
1084 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1085 const char *name, enum tree_code code,
1086 tree arg0, tree arg1)
1087 {
1088 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1089 gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1090 gimple_set_location (stmt, loc);
1091 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1092 return result;
1093 }
1094
1095 /* Build a gimple reference operation with the given CODE and argument
1096 ARG, assigning the result to a new SSA name of TYPE with NAME.
1097 Insert the statement prior to GSI's current position, and return
1098 the fresh SSA name. */
1099
1100 static inline tree
1101 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1102 const char *name, enum tree_code code, tree arg0)
1103 {
1104 tree result = make_temp_ssa_name (type, NULL, name);
1105 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1106 gimple_set_location (stmt, loc);
1107 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1108 return result;
1109 }
1110
1111 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1112 prior to GSI's current position, and return the fresh SSA name. */
1113
1114 static tree
1115 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1116 tree type, tree val)
1117 {
1118 tree result = make_ssa_name (type, NULL);
1119 gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1120 gimple_set_location (stmt, loc);
1121 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1122 return result;
1123 }
1124
1125 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1126 with location info LOC. If possible, create an equivalent and
1127 less expensive sequence of statements prior to GSI, and return an
 1128    expression holding the result.  */
1129
1130 static tree
1131 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1132 tree arg0, tree arg1)
1133 {
1134 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1135 REAL_VALUE_TYPE c2, dconst3;
1136 HOST_WIDE_INT n;
1137 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1138 enum machine_mode mode;
1139 bool hw_sqrt_exists, c_is_int, c2_is_int;
1140
1141 /* If the exponent isn't a constant, there's nothing of interest
1142 to be done. */
1143 if (TREE_CODE (arg1) != REAL_CST)
1144 return NULL_TREE;
1145
1146 /* If the exponent is equivalent to an integer, expand to an optimal
1147 multiplication sequence when profitable. */
1148 c = TREE_REAL_CST (arg1);
1149 n = real_to_integer (&c);
1150 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1151 c_is_int = real_identical (&c, &cint);
1152
1153 if (c_is_int
1154 && ((n >= -1 && n <= 2)
1155 || (flag_unsafe_math_optimizations
1156 && optimize_insn_for_speed_p ()
1157 && powi_cost (n) <= POWI_MAX_MULTS)))
1158 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1159
1160 /* Attempt various optimizations using sqrt and cbrt. */
1161 type = TREE_TYPE (arg0);
1162 mode = TYPE_MODE (type);
1163 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1164
1165 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1166 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1167 sqrt(-0) = -0. */
1168 if (sqrtfn
1169 && REAL_VALUES_EQUAL (c, dconsthalf)
1170 && !HONOR_SIGNED_ZEROS (mode))
1171 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1172
1173 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1174 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1175 so do this optimization even if -Os. Don't do this optimization
1176 if we don't have a hardware sqrt insn. */
1177 dconst1_4 = dconst1;
1178 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1179 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1180
1181 if (flag_unsafe_math_optimizations
1182 && sqrtfn
1183 && REAL_VALUES_EQUAL (c, dconst1_4)
1184 && hw_sqrt_exists)
1185 {
1186 /* sqrt(x) */
1187 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1188
1189 /* sqrt(sqrt(x)) */
1190 return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1191 }
1192
1193 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1194 optimizing for space. Don't do this optimization if we don't have
1195 a hardware sqrt insn. */
1196 real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1197 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1198
1199 if (flag_unsafe_math_optimizations
1200 && sqrtfn
1201 && optimize_function_for_speed_p (cfun)
1202 && REAL_VALUES_EQUAL (c, dconst3_4)
1203 && hw_sqrt_exists)
1204 {
1205 /* sqrt(x) */
1206 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1207
1208 /* sqrt(sqrt(x)) */
1209 sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1210
1211 /* sqrt(x) * sqrt(sqrt(x)) */
1212 return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1213 sqrt_arg0, sqrt_sqrt);
1214 }
1215
1216 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1217 optimizations since 1./3. is not exactly representable. If x
1218 is negative and finite, the correct value of pow(x,1./3.) is
1219 a NaN with the "invalid" exception raised, because the value
1220 of 1./3. actually has an even denominator. The correct value
1221 of cbrt(x) is a negative real value. */
1222 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1223 dconst1_3 = real_value_truncate (mode, dconst_third ());
1224
1225 if (flag_unsafe_math_optimizations
1226 && cbrtfn
1227 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1228 && REAL_VALUES_EQUAL (c, dconst1_3))
1229 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1230
1231 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1232 if we don't have a hardware sqrt insn. */
1233 dconst1_6 = dconst1_3;
1234 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1235
1236 if (flag_unsafe_math_optimizations
1237 && sqrtfn
1238 && cbrtfn
1239 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1240 && optimize_function_for_speed_p (cfun)
1241 && hw_sqrt_exists
1242 && REAL_VALUES_EQUAL (c, dconst1_6))
1243 {
1244 /* sqrt(x) */
1245 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1246
1247 /* cbrt(sqrt(x)) */
1248 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1249 }
1250
1251 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1252 and c not an integer, into
1253
1254 sqrt(x) * powi(x, n/2), n > 0;
1255 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1256
1257 Do not calculate the powi factor when n/2 = 0. */
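  /* For example, pow (x, 2.5) becomes sqrt (x) * powi (x, 2), and
     pow (x, -1.5) becomes 1.0 / (sqrt (x) * x).  */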
1258 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1259 n = real_to_integer (&c2);
1260 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1261 c2_is_int = real_identical (&c2, &cint);
1262
1263 if (flag_unsafe_math_optimizations
1264 && sqrtfn
1265 && c2_is_int
1266 && !c_is_int
1267 && optimize_function_for_speed_p (cfun))
1268 {
1269 tree powi_x_ndiv2 = NULL_TREE;
1270
1271 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1272 possible or profitable, give up. Skip the degenerate case when
1273 n is 1 or -1, where the result is always 1. */
1274 if (absu_hwi (n) != 1)
1275 {
1276 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1277 abs_hwi (n / 2));
1278 if (!powi_x_ndiv2)
1279 return NULL_TREE;
1280 }
1281
1282 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1283 result of the optimal multiply sequence just calculated. */
1284 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1285
1286 if (absu_hwi (n) == 1)
1287 result = sqrt_arg0;
1288 else
1289 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1290 sqrt_arg0, powi_x_ndiv2);
1291
1292 /* If n is negative, reciprocate the result. */
1293 if (n < 0)
1294 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1295 build_real (type, dconst1), result);
1296 return result;
1297 }
1298
1299 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1300
1301 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1302 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1303
1304 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1305 different from pow(x, 1./3.) due to rounding and behavior with
1306 negative x, we need to constrain this transformation to unsafe
1307 math and positive x or finite math. */
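  /* For example, pow (x, 4./3.) becomes x * cbrt (x), and
     pow (x, 5./3.) becomes x * (cbrt (x) * cbrt (x)).  */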
1308 real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1309 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1310 real_round (&c2, mode, &c2);
1311 n = real_to_integer (&c2);
1312 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1313 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1314 real_convert (&c2, mode, &c2);
1315
1316 if (flag_unsafe_math_optimizations
1317 && cbrtfn
1318 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1319 && real_identical (&c2, &c)
1320 && !c2_is_int
1321 && optimize_function_for_speed_p (cfun)
1322 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1323 {
1324 tree powi_x_ndiv3 = NULL_TREE;
1325
1326 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1327 possible or profitable, give up. Skip the degenerate case when
1328 abs(n) < 3, where the result is always 1. */
1329 if (absu_hwi (n) >= 3)
1330 {
1331 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1332 abs_hwi (n / 3));
1333 if (!powi_x_ndiv3)
1334 return NULL_TREE;
1335 }
1336
1337 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1338 as that creates an unnecessary variable. Instead, just produce
1339 either cbrt(x) or cbrt(x) * cbrt(x). */
1340 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1341
1342 if (absu_hwi (n) % 3 == 1)
1343 powi_cbrt_x = cbrt_x;
1344 else
1345 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1346 cbrt_x, cbrt_x);
1347
1348 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1349 if (absu_hwi (n) < 3)
1350 result = powi_cbrt_x;
1351 else
1352 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1353 powi_x_ndiv3, powi_cbrt_x);
1354
1355 /* If n is negative, reciprocate the result. */
1356 if (n < 0)
1357 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1358 build_real (type, dconst1), result);
1359
1360 return result;
1361 }
1362
1363 /* No optimizations succeeded. */
1364 return NULL_TREE;
1365 }
1366
1367 /* ARG is the argument to a cabs builtin call in GSI with location info
1368 LOC. Create a sequence of statements prior to GSI that calculates
1369 sqrt(R*R + I*I), where R and I are the real and imaginary components
1370 of ARG, respectively. Return an expression holding the result. */
1371
1372 static tree
1373 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1374 {
1375 tree real_part, imag_part, addend1, addend2, sum, result;
1376 tree type = TREE_TYPE (TREE_TYPE (arg));
1377 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1378 enum machine_mode mode = TYPE_MODE (type);
1379
1380 if (!flag_unsafe_math_optimizations
1381 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1382 || !sqrtfn
1383 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1384 return NULL_TREE;
1385
1386 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1387 REALPART_EXPR, arg);
1388 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1389 real_part, real_part);
1390 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1391 IMAGPART_EXPR, arg);
1392 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1393 imag_part, imag_part);
1394 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1395 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1396
1397 return result;
1398 }
1399
1400 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1401 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1402 an optimal number of multiplies, when n is a constant. */
1403
1404 static unsigned int
1405 execute_cse_sincos (void)
1406 {
1407 basic_block bb;
1408 bool cfg_changed = false;
1409
1410 calculate_dominance_info (CDI_DOMINATORS);
1411 memset (&sincos_stats, 0, sizeof (sincos_stats));
1412
1413 FOR_EACH_BB (bb)
1414 {
1415 gimple_stmt_iterator gsi;
1416 bool cleanup_eh = false;
1417
1418 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1419 {
1420 gimple stmt = gsi_stmt (gsi);
1421 tree fndecl;
1422
1423 /* Only the last stmt in a bb could throw, no need to call
1424 gimple_purge_dead_eh_edges if we change something in the middle
1425 of a basic block. */
1426 cleanup_eh = false;
1427
1428 if (is_gimple_call (stmt)
1429 && gimple_call_lhs (stmt)
1430 && (fndecl = gimple_call_fndecl (stmt))
1431 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1432 {
1433 tree arg, arg0, arg1, result;
1434 HOST_WIDE_INT n;
1435 location_t loc;
1436
1437 switch (DECL_FUNCTION_CODE (fndecl))
1438 {
1439 CASE_FLT_FN (BUILT_IN_COS):
1440 CASE_FLT_FN (BUILT_IN_SIN):
1441 CASE_FLT_FN (BUILT_IN_CEXPI):
1442 /* Make sure we have either sincos or cexp. */
1443 if (!targetm.libc_has_function (function_c99_math_complex)
1444 && !targetm.libc_has_function (function_sincos))
1445 break;
1446
1447 arg = gimple_call_arg (stmt, 0);
1448 if (TREE_CODE (arg) == SSA_NAME)
1449 cfg_changed |= execute_cse_sincos_1 (arg);
1450 break;
1451
1452 CASE_FLT_FN (BUILT_IN_POW):
1453 arg0 = gimple_call_arg (stmt, 0);
1454 arg1 = gimple_call_arg (stmt, 1);
1455
1456 loc = gimple_location (stmt);
1457 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1458
1459 if (result)
1460 {
1461 tree lhs = gimple_get_lhs (stmt);
1462 gimple new_stmt = gimple_build_assign (lhs, result);
1463 gimple_set_location (new_stmt, loc);
1464 unlink_stmt_vdef (stmt);
1465 gsi_replace (&gsi, new_stmt, true);
1466 cleanup_eh = true;
1467 if (gimple_vdef (stmt))
1468 release_ssa_name (gimple_vdef (stmt));
1469 }
1470 break;
1471
1472 CASE_FLT_FN (BUILT_IN_POWI):
1473 arg0 = gimple_call_arg (stmt, 0);
1474 arg1 = gimple_call_arg (stmt, 1);
1475 loc = gimple_location (stmt);
1476
1477 if (real_minus_onep (arg0))
1478 {
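		  /* powi (-1, n) is 1 for even n and -1 for odd n, so select
		     between the two constants from the low bit of the
		     exponent instead of expanding any multiplications.  */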
1479 tree t0, t1, cond, one, minus_one;
1480 gimple stmt;
1481
1482 t0 = TREE_TYPE (arg0);
1483 t1 = TREE_TYPE (arg1);
1484 one = build_real (t0, dconst1);
1485 minus_one = build_real (t0, dconstm1);
1486
1487 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1488 stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1489 arg1,
1490 build_int_cst (t1,
1491 1));
1492 gimple_set_location (stmt, loc);
1493 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1494
1495 result = make_temp_ssa_name (t0, NULL, "powi");
1496 stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1497 cond,
1498 minus_one, one);
1499 gimple_set_location (stmt, loc);
1500 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1501 }
1502 else
1503 {
1504 if (!host_integerp (arg1, 0))
1505 break;
1506
1507 n = TREE_INT_CST_LOW (arg1);
1508 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1509 }
1510
1511 if (result)
1512 {
1513 tree lhs = gimple_get_lhs (stmt);
1514 gimple new_stmt = gimple_build_assign (lhs, result);
1515 gimple_set_location (new_stmt, loc);
1516 unlink_stmt_vdef (stmt);
1517 gsi_replace (&gsi, new_stmt, true);
1518 cleanup_eh = true;
1519 if (gimple_vdef (stmt))
1520 release_ssa_name (gimple_vdef (stmt));
1521 }
1522 break;
1523
1524 CASE_FLT_FN (BUILT_IN_CABS):
1525 arg0 = gimple_call_arg (stmt, 0);
1526 loc = gimple_location (stmt);
1527 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1528
1529 if (result)
1530 {
1531 tree lhs = gimple_get_lhs (stmt);
1532 gimple new_stmt = gimple_build_assign (lhs, result);
1533 gimple_set_location (new_stmt, loc);
1534 unlink_stmt_vdef (stmt);
1535 gsi_replace (&gsi, new_stmt, true);
1536 cleanup_eh = true;
1537 if (gimple_vdef (stmt))
1538 release_ssa_name (gimple_vdef (stmt));
1539 }
1540 break;
1541
1542 default:;
1543 }
1544 }
1545 }
1546 if (cleanup_eh)
1547 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1548 }
1549
1550 statistics_counter_event (cfun, "sincos statements inserted",
1551 sincos_stats.inserted);
1552
1553 free_dominance_info (CDI_DOMINATORS);
1554 return cfg_changed ? TODO_cleanup_cfg : 0;
1555 }
1556
1557 static bool
1558 gate_cse_sincos (void)
1559 {
1560 /* We no longer require either sincos or cexp, since powi expansion
1561 piggybacks on this pass. */
1562 return optimize;
1563 }
1564
1565 namespace {
1566
1567 const pass_data pass_data_cse_sincos =
1568 {
1569 GIMPLE_PASS, /* type */
1570 "sincos", /* name */
1571 OPTGROUP_NONE, /* optinfo_flags */
1572 true, /* has_gate */
1573 true, /* has_execute */
1574 TV_NONE, /* tv_id */
1575 PROP_ssa, /* properties_required */
1576 0, /* properties_provided */
1577 0, /* properties_destroyed */
1578 0, /* todo_flags_start */
1579 ( TODO_update_ssa | TODO_verify_ssa
1580 | TODO_verify_stmts ), /* todo_flags_finish */
1581 };
1582
1583 class pass_cse_sincos : public gimple_opt_pass
1584 {
1585 public:
1586 pass_cse_sincos (gcc::context *ctxt)
1587 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1588 {}
1589
1590 /* opt_pass methods: */
1591 bool gate () { return gate_cse_sincos (); }
1592 unsigned int execute () { return execute_cse_sincos (); }
1593
1594 }; // class pass_cse_sincos
1595
1596 } // anon namespace
1597
1598 gimple_opt_pass *
1599 make_pass_cse_sincos (gcc::context *ctxt)
1600 {
1601 return new pass_cse_sincos (ctxt);
1602 }
1603
1604 /* A symbolic number is used to detect byte permutation and selection
1605 patterns. Therefore the field N contains an artificial number
1606 consisting of byte size markers:
1607
1608 0 - byte has the value 0
1609 1..size - byte contains the content of the byte
1610 number indexed with that value minus one */
1611
1612 struct symbolic_number {
1613 unsigned HOST_WIDEST_INT n;
1614 int size;
1615 };
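/* As an illustration: for a 32-bit operand the leaf value of N set up by
   find_bswap_1 below is 0x04030201.  In a hand-written swap such as

     (x << 24) | ((x << 8) & 0xff0000) | ((x >> 8) & 0xff00) | (x >> 24)

   the four shifted and masked copies carry the symbolic values 0x01000000,
   0x00020000, 0x00000300 and 0x00000004; OR-ing them gives 0x01020304, the
   fully reversed pattern that find_bswap looks for.  */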
1616
1617 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1618 number N. Return false if the requested operation is not permitted
1619 on a symbolic number. */
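/* For instance, an LROTATE_EXPR by 8 on the 32-bit value 0x04030201 yields
   0x03020104: every byte marker moves up one position and the topmost
   marker wraps around to the bottom.  */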
1620
1621 static inline bool
1622 do_shift_rotate (enum tree_code code,
1623 struct symbolic_number *n,
1624 int count)
1625 {
1626 if (count % 8 != 0)
1627 return false;
1628
1629 /* Zero out the extra bits of N in order to avoid them being shifted
1630 into the significant bits. */
1631 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1632 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1633
1634 switch (code)
1635 {
1636 case LSHIFT_EXPR:
1637 n->n <<= count;
1638 break;
1639 case RSHIFT_EXPR:
1640 n->n >>= count;
1641 break;
1642 case LROTATE_EXPR:
1643 n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1644 break;
1645 case RROTATE_EXPR:
1646 n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1647 break;
1648 default:
1649 return false;
1650 }
1651 /* Zero unused bits for size. */
1652 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1653 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1654 return true;
1655 }
1656
1657 /* Perform sanity checking for the symbolic number N and the gimple
1658 statement STMT. */
1659
1660 static inline bool
1661 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1662 {
1663 tree lhs_type;
1664
1665 lhs_type = gimple_expr_type (stmt);
1666
1667 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1668 return false;
1669
1670 if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1671 return false;
1672
1673 return true;
1674 }
1675
1676 /* find_bswap_1 invokes itself recursively with N and tries to perform
1677 the operation given by the rhs of STMT on the result. If the
 1678    operation could be executed successfully, the function returns the
1679 tree expression of the source operand and NULL otherwise. */
1680
1681 static tree
1682 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1683 {
1684 enum tree_code code;
1685 tree rhs1, rhs2 = NULL;
1686 gimple rhs1_stmt, rhs2_stmt;
1687 tree source_expr1;
1688 enum gimple_rhs_class rhs_class;
1689
1690 if (!limit || !is_gimple_assign (stmt))
1691 return NULL_TREE;
1692
1693 rhs1 = gimple_assign_rhs1 (stmt);
1694
1695 if (TREE_CODE (rhs1) != SSA_NAME)
1696 return NULL_TREE;
1697
1698 code = gimple_assign_rhs_code (stmt);
1699 rhs_class = gimple_assign_rhs_class (stmt);
1700 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1701
1702 if (rhs_class == GIMPLE_BINARY_RHS)
1703 rhs2 = gimple_assign_rhs2 (stmt);
1704
1705 /* Handle unary rhs and binary rhs with integer constants as second
1706 operand. */
1707
1708 if (rhs_class == GIMPLE_UNARY_RHS
1709 || (rhs_class == GIMPLE_BINARY_RHS
1710 && TREE_CODE (rhs2) == INTEGER_CST))
1711 {
1712 if (code != BIT_AND_EXPR
1713 && code != LSHIFT_EXPR
1714 && code != RSHIFT_EXPR
1715 && code != LROTATE_EXPR
1716 && code != RROTATE_EXPR
1717 && code != NOP_EXPR
1718 && code != CONVERT_EXPR)
1719 return NULL_TREE;
1720
1721 source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1722
 1723      /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
1724 to initialize the symbolic number. */
1725 if (!source_expr1)
1726 {
1727 /* Set up the symbolic number N by setting each byte to a
1728 value between 1 and the byte size of rhs1. The highest
1729 order byte is set to n->size and the lowest order
1730 byte to 1. */
1731 n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1732 if (n->size % BITS_PER_UNIT != 0)
1733 return NULL_TREE;
1734 n->size /= BITS_PER_UNIT;
1735 n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1736 (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1737
1738 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1739 n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1740 (n->size * BITS_PER_UNIT)) - 1;
1741
1742 source_expr1 = rhs1;
1743 }
1744
1745 switch (code)
1746 {
1747 case BIT_AND_EXPR:
1748 {
1749 int i;
1750 unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1751 unsigned HOST_WIDEST_INT tmp = val;
1752
1753 /* Only constants masking full bytes are allowed. */
1754 for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1755 if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1756 return NULL_TREE;
1757
1758 n->n &= val;
1759 }
1760 break;
1761 case LSHIFT_EXPR:
1762 case RSHIFT_EXPR:
1763 case LROTATE_EXPR:
1764 case RROTATE_EXPR:
1765 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1766 return NULL_TREE;
1767 break;
1768 CASE_CONVERT:
1769 {
1770 int type_size;
1771
1772 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1773 if (type_size % BITS_PER_UNIT != 0)
1774 return NULL_TREE;
1775
1776 if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1777 {
1778 /* If STMT casts to a smaller type mask out the bits not
1779 belonging to the target type. */
1780 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1781 }
1782 n->size = type_size / BITS_PER_UNIT;
1783 }
1784 break;
1785 default:
1786 return NULL_TREE;
1787 };
1788 return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1789 }
1790
1791 /* Handle binary rhs. */
1792
1793 if (rhs_class == GIMPLE_BINARY_RHS)
1794 {
1795 struct symbolic_number n1, n2;
1796 tree source_expr2;
1797
1798 if (code != BIT_IOR_EXPR)
1799 return NULL_TREE;
1800
1801 if (TREE_CODE (rhs2) != SSA_NAME)
1802 return NULL_TREE;
1803
1804 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1805
1806 switch (code)
1807 {
1808 case BIT_IOR_EXPR:
1809 source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1810
1811 if (!source_expr1)
1812 return NULL_TREE;
1813
1814 source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1815
1816 if (source_expr1 != source_expr2
1817 || n1.size != n2.size)
1818 return NULL_TREE;
1819
1820 n->size = n1.size;
1821 n->n = n1.n | n2.n;
1822
1823 if (!verify_symbolic_number_p (n, stmt))
1824 return NULL_TREE;
1825
1826 break;
1827 default:
1828 return NULL_TREE;
1829 }
1830 return source_expr1;
1831 }
1832 return NULL_TREE;
1833 }
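
/* A minimal host-side sketch (hypothetical helper, not part of the pass)
   of the symbolic-number bookkeeping above: each byte of N names one
   source byte, 1 being the least significant, so the 16-bit leaf value is
   0x0201.  Shifting the source value permutes those markers in the same
   way, which is why comparing the final marker word against a fixed
   constant is enough to recognize a byte swap.  */

static unsigned HOST_WIDEST_INT
symbolic_number_sketch (void)
{
  unsigned HOST_WIDEST_INT lo, hi;

  lo = (0x0201 << 8) & 0xffff;	/* models "x << 8": marker word 0x0100 */
  hi = 0x0201 >> 8;		/* models "x >> 8": marker word 0x0002 */
  return lo | hi;		/* 0x0102, the bswap pattern for 2 bytes */
}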
1834
1835 /* Check if STMT completes a bswap implementation consisting of ORs,
1836 SHIFTs and ANDs. Return the source tree expression on which the
1837    byte swap is performed, or NULL if no bswap was found. */
1838
1839 static tree
1840 find_bswap (gimple stmt)
1841 {
1842 /* The number which the find_bswap result should match in order to
1843      have a full byte swap.  The number is shifted to the right according
1844 to the size of the symbolic number before using it. */
1845 unsigned HOST_WIDEST_INT cmp =
1846 sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1847 (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1848
1849 struct symbolic_number n;
1850 tree source_expr;
1851 int limit;
1852
1853   /* The last parameter determines the search depth limit.  It usually
1854      correlates directly to the number of bytes to be touched.  We
1855      increase that number here in order to also
1856      cover signed -> unsigned conversions of the src operand as can be seen
1857      in libgcc, and for the initial shift/and operation of the src operand. */
1858 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1859 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1860 source_expr = find_bswap_1 (stmt, &n, limit);
1861
1862 if (!source_expr)
1863 return NULL_TREE;
1864
1865 /* Zero out the extra bits of N and CMP. */
1866 if (n.size < (int)sizeof (HOST_WIDEST_INT))
1867 {
1868 unsigned HOST_WIDEST_INT mask =
1869 ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1870
1871 n.n &= mask;
1872 cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1873 }
1874
1875   /* A complete byte swap should make the symbolic number start
1876 with the largest digit in the highest order byte. */
1877 if (cmp != n.n)
1878 return NULL_TREE;
1879
1880 return source_expr;
1881 }
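
/* For illustration only (a hypothetical user-level function, not part of
   this file): a source-level pattern the machinery above is meant to
   recognize.  Assuming a 32-bit unsigned int, find_bswap reduces the ORs,
   SHIFTs and ANDs below to the symbolic number 0x01020304 and returns X,
   so the pass can replace the whole sequence with __builtin_bswap32 (x).  */

static inline unsigned int
manual_bswap32 (unsigned int x)
{
  return ((x & 0x000000ffu) << 24)
	 | ((x & 0x0000ff00u) << 8)
	 | ((x & 0x00ff0000u) >> 8)
	 | ((x & 0xff000000u) >> 24);
}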
1882
1883 /* Find manual byte swap implementations and turn them into a bswap
1884    builtin invocation. */
1885
1886 static unsigned int
1887 execute_optimize_bswap (void)
1888 {
1889 basic_block bb;
1890 bool bswap16_p, bswap32_p, bswap64_p;
1891 bool changed = false;
1892 tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1893
1894 if (BITS_PER_UNIT != 8)
1895 return 0;
1896
1897 if (sizeof (HOST_WIDEST_INT) < 8)
1898 return 0;
1899
1900 bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1901 && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1902 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1903 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1904 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1905 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1906 || (bswap32_p && word_mode == SImode)));
1907
1908 if (!bswap16_p && !bswap32_p && !bswap64_p)
1909 return 0;
1910
1911 /* Determine the argument type of the builtins. The code later on
1912      assumes that the return and argument types are the same. */
1913 if (bswap16_p)
1914 {
1915 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1916 bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1917 }
1918
1919 if (bswap32_p)
1920 {
1921 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1922 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1923 }
1924
1925 if (bswap64_p)
1926 {
1927 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1928 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1929 }
1930
1931 memset (&bswap_stats, 0, sizeof (bswap_stats));
1932
1933 FOR_EACH_BB (bb)
1934 {
1935 gimple_stmt_iterator gsi;
1936
1937 /* We do a reverse scan for bswap patterns to make sure we get the
1938 widest match. As bswap pattern matching doesn't handle
1939 previously inserted smaller bswap replacements as sub-
1940 patterns, the wider variant wouldn't be detected. */
1941 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1942 {
1943 gimple stmt = gsi_stmt (gsi);
1944 tree bswap_src, bswap_type;
1945 tree bswap_tmp;
1946 tree fndecl = NULL_TREE;
1947 int type_size;
1948 gimple call;
1949
1950 if (!is_gimple_assign (stmt)
1951 || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1952 continue;
1953
1954 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1955
1956 switch (type_size)
1957 {
1958 case 16:
1959 if (bswap16_p)
1960 {
1961 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1962 bswap_type = bswap16_type;
1963 }
1964 break;
1965 case 32:
1966 if (bswap32_p)
1967 {
1968 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1969 bswap_type = bswap32_type;
1970 }
1971 break;
1972 case 64:
1973 if (bswap64_p)
1974 {
1975 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1976 bswap_type = bswap64_type;
1977 }
1978 break;
1979 default:
1980 continue;
1981 }
1982
1983 if (!fndecl)
1984 continue;
1985
1986 bswap_src = find_bswap (stmt);
1987
1988 if (!bswap_src)
1989 continue;
1990
1991 changed = true;
1992 if (type_size == 16)
1993 bswap_stats.found_16bit++;
1994 else if (type_size == 32)
1995 bswap_stats.found_32bit++;
1996 else
1997 bswap_stats.found_64bit++;
1998
1999 bswap_tmp = bswap_src;
2000
2001 /* Convert the src expression if necessary. */
2002 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2003 {
2004 gimple convert_stmt;
2005 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2006 convert_stmt = gimple_build_assign_with_ops
2007 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2008 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2009 }
2010
2011 call = gimple_build_call (fndecl, 1, bswap_tmp);
2012
2013 bswap_tmp = gimple_assign_lhs (stmt);
2014
2015 /* Convert the result if necessary. */
2016 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2017 {
2018 gimple convert_stmt;
2019 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2020 convert_stmt = gimple_build_assign_with_ops
2021 (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2022 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2023 }
2024
2025 gimple_call_set_lhs (call, bswap_tmp);
2026
2027 if (dump_file)
2028 {
2029 fprintf (dump_file, "%d bit bswap implementation found at: ",
2030 (int)type_size);
2031 print_gimple_stmt (dump_file, stmt, 0, 0);
2032 }
2033
2034 gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2035 gsi_remove (&gsi, true);
2036 }
2037 }
2038
2039 statistics_counter_event (cfun, "16-bit bswap implementations found",
2040 bswap_stats.found_16bit);
2041 statistics_counter_event (cfun, "32-bit bswap implementations found",
2042 bswap_stats.found_32bit);
2043 statistics_counter_event (cfun, "64-bit bswap implementations found",
2044 bswap_stats.found_64bit);
2045
2046 return (changed ? TODO_update_ssa | TODO_verify_ssa
2047 | TODO_verify_stmts : 0);
2048 }
2049
2050 static bool
2051 gate_optimize_bswap (void)
2052 {
2053 return flag_expensive_optimizations && optimize;
2054 }
2055
2056 namespace {
2057
2058 const pass_data pass_data_optimize_bswap =
2059 {
2060 GIMPLE_PASS, /* type */
2061 "bswap", /* name */
2062 OPTGROUP_NONE, /* optinfo_flags */
2063 true, /* has_gate */
2064 true, /* has_execute */
2065 TV_NONE, /* tv_id */
2066 PROP_ssa, /* properties_required */
2067 0, /* properties_provided */
2068 0, /* properties_destroyed */
2069 0, /* todo_flags_start */
2070 0, /* todo_flags_finish */
2071 };
2072
2073 class pass_optimize_bswap : public gimple_opt_pass
2074 {
2075 public:
2076 pass_optimize_bswap (gcc::context *ctxt)
2077 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2078 {}
2079
2080 /* opt_pass methods: */
2081 bool gate () { return gate_optimize_bswap (); }
2082 unsigned int execute () { return execute_optimize_bswap (); }
2083
2084 }; // class pass_optimize_bswap
2085
2086 } // anon namespace
2087
2088 gimple_opt_pass *
2089 make_pass_optimize_bswap (gcc::context *ctxt)
2090 {
2091 return new pass_optimize_bswap (ctxt);
2092 }
2093
2094 /* Return true if STMT is a type conversion operation that can be stripped
2095 when used in a widening multiply operation. */
2096 static bool
2097 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2098 {
2099 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2100
2101 if (TREE_CODE (result_type) == INTEGER_TYPE)
2102 {
2103 tree op_type;
2104 tree inner_op_type;
2105
2106 if (!CONVERT_EXPR_CODE_P (rhs_code))
2107 return false;
2108
2109 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2110
2111 /* If the type of OP has the same precision as the result, then
2112 we can strip this conversion. The multiply operation will be
2113 selected to create the correct extension as a by-product. */
2114 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2115 return true;
2116
2117 /* We can also strip a conversion if it preserves the signed-ness of
2118 the operation and doesn't narrow the range. */
2119 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2120
2121 /* If the inner-most type is unsigned, then we can strip any
2122 intermediate widening operation. If it's signed, then the
2123 intermediate widening operation must also be signed. */
2124 if ((TYPE_UNSIGNED (inner_op_type)
2125 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2126 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2127 return true;
2128
2129 return false;
2130 }
2131
2132 return rhs_code == FIXED_CONVERT_EXPR;
2133 }
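
/* A hedged source-level illustration of the strippable-conversion case
   (assuming 32-bit int and 64-bit long long): the casts widen each operand
   to the result precision, so they satisfy the precision check above and
   can be stripped, letting the MULT_EXPR become a WIDEN_MULT_EXPR of the
   two int operands.  */

static long long
widening_mult_example (int a, int b)
{
  return (long long) a * (long long) b;
}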
2134
2135 /* Return true if RHS is a suitable operand for a widening multiplication,
2136 assuming a target type of TYPE.
2137 There are two cases:
2138
2139 - RHS makes some value at least twice as wide. Store that value
2140 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2141
2142 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2143 but leave *TYPE_OUT untouched. */
2144
2145 static bool
2146 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2147 tree *new_rhs_out)
2148 {
2149 gimple stmt;
2150 tree type1, rhs1;
2151
2152 if (TREE_CODE (rhs) == SSA_NAME)
2153 {
2154 stmt = SSA_NAME_DEF_STMT (rhs);
2155 if (is_gimple_assign (stmt))
2156 {
2157 if (! widening_mult_conversion_strippable_p (type, stmt))
2158 rhs1 = rhs;
2159 else
2160 {
2161 rhs1 = gimple_assign_rhs1 (stmt);
2162
2163 if (TREE_CODE (rhs1) == INTEGER_CST)
2164 {
2165 *new_rhs_out = rhs1;
2166 *type_out = NULL;
2167 return true;
2168 }
2169 }
2170 }
2171 else
2172 rhs1 = rhs;
2173
2174 type1 = TREE_TYPE (rhs1);
2175
2176 if (TREE_CODE (type1) != TREE_CODE (type)
2177 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2178 return false;
2179
2180 *new_rhs_out = rhs1;
2181 *type_out = type1;
2182 return true;
2183 }
2184
2185 if (TREE_CODE (rhs) == INTEGER_CST)
2186 {
2187 *new_rhs_out = rhs;
2188 *type_out = NULL;
2189 return true;
2190 }
2191
2192 return false;
2193 }
2194
2195 /* Return true if STMT performs a widening multiplication, assuming the
2196 output type is TYPE. If so, store the unwidened types of the operands
2197 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2198 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2199 and *TYPE2_OUT would give the operands of the multiplication. */
2200
2201 static bool
2202 is_widening_mult_p (gimple stmt,
2203 tree *type1_out, tree *rhs1_out,
2204 tree *type2_out, tree *rhs2_out)
2205 {
2206 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2207
2208 if (TREE_CODE (type) != INTEGER_TYPE
2209 && TREE_CODE (type) != FIXED_POINT_TYPE)
2210 return false;
2211
2212 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2213 rhs1_out))
2214 return false;
2215
2216 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2217 rhs2_out))
2218 return false;
2219
2220 if (*type1_out == NULL)
2221 {
2222 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2223 return false;
2224 *type1_out = *type2_out;
2225 }
2226
2227 if (*type2_out == NULL)
2228 {
2229 if (!int_fits_type_p (*rhs2_out, *type1_out))
2230 return false;
2231 *type2_out = *type1_out;
2232 }
2233
2234 /* Ensure that the larger of the two operands comes first. */
2235 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2236 {
2237 tree tmp;
2238 tmp = *type1_out;
2239 *type1_out = *type2_out;
2240 *type2_out = tmp;
2241 tmp = *rhs1_out;
2242 *rhs1_out = *rhs2_out;
2243 *rhs2_out = tmp;
2244 }
2245
2246 return true;
2247 }
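
/* Sketch of the constant-operand case handled above (same assumptions as
   the previous example): the second operand is an INTEGER_CST, so
   is_widening_mult_rhs_p leaves *TYPE2_OUT NULL and it is filled in from
   *TYPE1_OUT here, provided the constant fits that type.  */

static long long
widening_mult_by_const_example (int a)
{
  return (long long) a * 1000;
}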
2248
2249 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2250 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2251 value is true iff we converted the statement. */
2252
2253 static bool
2254 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2255 {
2256 tree lhs, rhs1, rhs2, type, type1, type2;
2257 enum insn_code handler;
2258 enum machine_mode to_mode, from_mode, actual_mode;
2259 optab op;
2260 int actual_precision;
2261 location_t loc = gimple_location (stmt);
2262 bool from_unsigned1, from_unsigned2;
2263
2264 lhs = gimple_assign_lhs (stmt);
2265 type = TREE_TYPE (lhs);
2266 if (TREE_CODE (type) != INTEGER_TYPE)
2267 return false;
2268
2269 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2270 return false;
2271
2272 to_mode = TYPE_MODE (type);
2273 from_mode = TYPE_MODE (type1);
2274 from_unsigned1 = TYPE_UNSIGNED (type1);
2275 from_unsigned2 = TYPE_UNSIGNED (type2);
2276
2277 if (from_unsigned1 && from_unsigned2)
2278 op = umul_widen_optab;
2279 else if (!from_unsigned1 && !from_unsigned2)
2280 op = smul_widen_optab;
2281 else
2282 op = usmul_widen_optab;
2283
2284 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2285 0, &actual_mode);
2286
2287 if (handler == CODE_FOR_nothing)
2288 {
2289 if (op != smul_widen_optab)
2290 {
2291 /* We can use a signed multiply with unsigned types as long as
2292 there is a wider mode to use, or it is the smaller of the two
2293 types that is unsigned. Note that type1 >= type2, always. */
2294 if ((TYPE_UNSIGNED (type1)
2295 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2296 || (TYPE_UNSIGNED (type2)
2297 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2298 {
2299 from_mode = GET_MODE_WIDER_MODE (from_mode);
2300 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2301 return false;
2302 }
2303
2304 op = smul_widen_optab;
2305 handler = find_widening_optab_handler_and_mode (op, to_mode,
2306 from_mode, 0,
2307 &actual_mode);
2308
2309 if (handler == CODE_FOR_nothing)
2310 return false;
2311
2312 from_unsigned1 = from_unsigned2 = false;
2313 }
2314 else
2315 return false;
2316 }
2317
2318   /* Ensure that the inputs to the handler are in the correct precision
2319 for the opcode. This will be the full mode size. */
2320 actual_precision = GET_MODE_PRECISION (actual_mode);
2321 if (2 * actual_precision > TYPE_PRECISION (type))
2322 return false;
2323 if (actual_precision != TYPE_PRECISION (type1)
2324 || from_unsigned1 != TYPE_UNSIGNED (type1))
2325 rhs1 = build_and_insert_cast (gsi, loc,
2326 build_nonstandard_integer_type
2327 (actual_precision, from_unsigned1), rhs1);
2328 if (actual_precision != TYPE_PRECISION (type2)
2329 || from_unsigned2 != TYPE_UNSIGNED (type2))
2330 rhs2 = build_and_insert_cast (gsi, loc,
2331 build_nonstandard_integer_type
2332 (actual_precision, from_unsigned2), rhs2);
2333
2334 /* Handle constants. */
2335 if (TREE_CODE (rhs1) == INTEGER_CST)
2336 rhs1 = fold_convert (type1, rhs1);
2337 if (TREE_CODE (rhs2) == INTEGER_CST)
2338 rhs2 = fold_convert (type2, rhs2);
2339
2340 gimple_assign_set_rhs1 (stmt, rhs1);
2341 gimple_assign_set_rhs2 (stmt, rhs2);
2342 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2343 update_stmt (stmt);
2344 widen_mul_stats.widen_mults_inserted++;
2345 return true;
2346 }
2347
2348 /* Process a single gimple statement STMT, which is found at the
2349    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2350 rhs (given by CODE), and try to convert it into a
2351 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2352 is true iff we converted the statement. */
2353
2354 static bool
2355 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2356 enum tree_code code)
2357 {
2358 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2359 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2360 tree type, type1, type2, optype;
2361 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2362 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2363 optab this_optab;
2364 enum tree_code wmult_code;
2365 enum insn_code handler;
2366 enum machine_mode to_mode, from_mode, actual_mode;
2367 location_t loc = gimple_location (stmt);
2368 int actual_precision;
2369 bool from_unsigned1, from_unsigned2;
2370
2371 lhs = gimple_assign_lhs (stmt);
2372 type = TREE_TYPE (lhs);
2373 if (TREE_CODE (type) != INTEGER_TYPE
2374 && TREE_CODE (type) != FIXED_POINT_TYPE)
2375 return false;
2376
2377 if (code == MINUS_EXPR)
2378 wmult_code = WIDEN_MULT_MINUS_EXPR;
2379 else
2380 wmult_code = WIDEN_MULT_PLUS_EXPR;
2381
2382 rhs1 = gimple_assign_rhs1 (stmt);
2383 rhs2 = gimple_assign_rhs2 (stmt);
2384
2385 if (TREE_CODE (rhs1) == SSA_NAME)
2386 {
2387 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2388 if (is_gimple_assign (rhs1_stmt))
2389 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2390 }
2391
2392 if (TREE_CODE (rhs2) == SSA_NAME)
2393 {
2394 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2395 if (is_gimple_assign (rhs2_stmt))
2396 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2397 }
2398
2399 /* Allow for one conversion statement between the multiply
2400      and addition/subtraction statement.  If there is more than
2401      one conversion then we assume they would invalidate this
2402 transformation. If that's not the case then they should have
2403 been folded before now. */
2404 if (CONVERT_EXPR_CODE_P (rhs1_code))
2405 {
2406 conv1_stmt = rhs1_stmt;
2407 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2408 if (TREE_CODE (rhs1) == SSA_NAME)
2409 {
2410 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2411 if (is_gimple_assign (rhs1_stmt))
2412 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2413 }
2414 else
2415 return false;
2416 }
2417 if (CONVERT_EXPR_CODE_P (rhs2_code))
2418 {
2419 conv2_stmt = rhs2_stmt;
2420 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2421 if (TREE_CODE (rhs2) == SSA_NAME)
2422 {
2423 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2424 if (is_gimple_assign (rhs2_stmt))
2425 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2426 }
2427 else
2428 return false;
2429 }
2430
2431 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2432      is_widening_mult_p, but we still need the returned rhs values.
2433
2434 It might also appear that it would be sufficient to use the existing
2435 operands of the widening multiply, but that would limit the choice of
2436 multiply-and-accumulate instructions.
2437
2438      If the widened-multiplication result has more than one use, it is
2439 probably wiser not to do the conversion. */
2440 if (code == PLUS_EXPR
2441 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2442 {
2443 if (!has_single_use (rhs1)
2444 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2445 &type2, &mult_rhs2))
2446 return false;
2447 add_rhs = rhs2;
2448 conv_stmt = conv1_stmt;
2449 }
2450 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2451 {
2452 if (!has_single_use (rhs2)
2453 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2454 &type2, &mult_rhs2))
2455 return false;
2456 add_rhs = rhs1;
2457 conv_stmt = conv2_stmt;
2458 }
2459 else
2460 return false;
2461
2462 to_mode = TYPE_MODE (type);
2463 from_mode = TYPE_MODE (type1);
2464 from_unsigned1 = TYPE_UNSIGNED (type1);
2465 from_unsigned2 = TYPE_UNSIGNED (type2);
2466 optype = type1;
2467
2468 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2469 if (from_unsigned1 != from_unsigned2)
2470 {
2471 if (!INTEGRAL_TYPE_P (type))
2472 return false;
2473 /* We can use a signed multiply with unsigned types as long as
2474 there is a wider mode to use, or it is the smaller of the two
2475 types that is unsigned. Note that type1 >= type2, always. */
2476 if ((from_unsigned1
2477 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2478 || (from_unsigned2
2479 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2480 {
2481 from_mode = GET_MODE_WIDER_MODE (from_mode);
2482 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2483 return false;
2484 }
2485
2486 from_unsigned1 = from_unsigned2 = false;
2487 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2488 false);
2489 }
2490
2491 /* If there was a conversion between the multiply and addition
2492 then we need to make sure it fits a multiply-and-accumulate.
2493      There should be a single mode change which does not change the
2494 value. */
2495 if (conv_stmt)
2496 {
2497 /* We use the original, unmodified data types for this. */
2498 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2499 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2500 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2501 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2502
2503 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2504 {
2505 /* Conversion is a truncate. */
2506 if (TYPE_PRECISION (to_type) < data_size)
2507 return false;
2508 }
2509 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2510 {
2511 /* Conversion is an extend. Check it's the right sort. */
2512 if (TYPE_UNSIGNED (from_type) != is_unsigned
2513 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2514 return false;
2515 }
2516 /* else convert is a no-op for our purposes. */
2517 }
2518
2519 /* Verify that the machine can perform a widening multiply
2520 accumulate in this mode/signedness combination, otherwise
2521 this transformation is likely to pessimize code. */
2522 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2523 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2524 from_mode, 0, &actual_mode);
2525
2526 if (handler == CODE_FOR_nothing)
2527 return false;
2528
2529   /* Ensure that the inputs to the handler are in the correct precision
2530 for the opcode. This will be the full mode size. */
2531 actual_precision = GET_MODE_PRECISION (actual_mode);
2532 if (actual_precision != TYPE_PRECISION (type1)
2533 || from_unsigned1 != TYPE_UNSIGNED (type1))
2534 mult_rhs1 = build_and_insert_cast (gsi, loc,
2535 build_nonstandard_integer_type
2536 (actual_precision, from_unsigned1),
2537 mult_rhs1);
2538 if (actual_precision != TYPE_PRECISION (type2)
2539 || from_unsigned2 != TYPE_UNSIGNED (type2))
2540 mult_rhs2 = build_and_insert_cast (gsi, loc,
2541 build_nonstandard_integer_type
2542 (actual_precision, from_unsigned2),
2543 mult_rhs2);
2544
2545 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2546 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2547
2548 /* Handle constants. */
2549 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2550 mult_rhs1 = fold_convert (type1, mult_rhs1);
2551 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2552 mult_rhs2 = fold_convert (type2, mult_rhs2);
2553
2554 gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2555 add_rhs);
2556 update_stmt (gsi_stmt (*gsi));
2557 widen_mul_stats.maccs_inserted++;
2558 return true;
2559 }
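
/* Source-level sketch of the pattern convert_plusminus_to_widen looks for
   (hypothetical example; it only pays off when the target provides a
   widening multiply-accumulate insn, i.e. the optab handler above is not
   CODE_FOR_nothing): the widening multiply feeding the PLUS_EXPR is fused
   with the addition into a single WIDEN_MULT_PLUS_EXPR.  */

static long long
widening_macc_example (long long acc, int a, int b)
{
  return acc + (long long) a * (long long) b;
}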
2560
2561 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2562 with uses in additions and subtractions to form fused multiply-add
2563 operations. Returns true if successful and MUL_STMT should be removed. */
2564
2565 static bool
2566 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2567 {
2568 tree mul_result = gimple_get_lhs (mul_stmt);
2569 tree type = TREE_TYPE (mul_result);
2570 gimple use_stmt, neguse_stmt, fma_stmt;
2571 use_operand_p use_p;
2572 imm_use_iterator imm_iter;
2573
2574 if (FLOAT_TYPE_P (type)
2575 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2576 return false;
2577
2578 /* We don't want to do bitfield reduction ops. */
2579 if (INTEGRAL_TYPE_P (type)
2580 && (TYPE_PRECISION (type)
2581 != GET_MODE_PRECISION (TYPE_MODE (type))))
2582 return false;
2583
2584 /* If the target doesn't support it, don't generate it. We assume that
2585 if fma isn't available then fms, fnma or fnms are not either. */
2586 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2587 return false;
2588
2589 /* If the multiplication has zero uses, it is kept around probably because
2590 of -fnon-call-exceptions. Don't optimize it away in that case,
2591      it is DCE's job. */
2592 if (has_zero_uses (mul_result))
2593 return false;
2594
2595 /* Make sure that the multiplication statement becomes dead after
2596      the transformation, i.e. that all uses are transformed to FMAs.
2597 This means we assume that an FMA operation has the same cost
2598 as an addition. */
2599 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2600 {
2601 enum tree_code use_code;
2602 tree result = mul_result;
2603 bool negate_p = false;
2604
2605 use_stmt = USE_STMT (use_p);
2606
2607 if (is_gimple_debug (use_stmt))
2608 continue;
2609
2610       /* For now restrict this operation to single basic blocks.  In theory
2611 we would want to support sinking the multiplication in
2612 m = a*b;
2613 if ()
2614 ma = m + c;
2615 else
2616 d = m;
2617 to form a fma in the then block and sink the multiplication to the
2618 else block. */
2619 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2620 return false;
2621
2622 if (!is_gimple_assign (use_stmt))
2623 return false;
2624
2625 use_code = gimple_assign_rhs_code (use_stmt);
2626
2627 /* A negate on the multiplication leads to FNMA. */
2628 if (use_code == NEGATE_EXPR)
2629 {
2630 ssa_op_iter iter;
2631 use_operand_p usep;
2632
2633 result = gimple_assign_lhs (use_stmt);
2634
2635 /* Make sure the negate statement becomes dead with this
2636 single transformation. */
2637 if (!single_imm_use (gimple_assign_lhs (use_stmt),
2638 &use_p, &neguse_stmt))
2639 return false;
2640
2641 /* Make sure the multiplication isn't also used on that stmt. */
2642 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2643 if (USE_FROM_PTR (usep) == mul_result)
2644 return false;
2645
2646 /* Re-validate. */
2647 use_stmt = neguse_stmt;
2648 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2649 return false;
2650 if (!is_gimple_assign (use_stmt))
2651 return false;
2652
2653 use_code = gimple_assign_rhs_code (use_stmt);
2654 negate_p = true;
2655 }
2656
2657 switch (use_code)
2658 {
2659 case MINUS_EXPR:
2660 if (gimple_assign_rhs2 (use_stmt) == result)
2661 negate_p = !negate_p;
2662 break;
2663 case PLUS_EXPR:
2664 break;
2665 default:
2666 /* FMA can only be formed from PLUS and MINUS. */
2667 return false;
2668 }
2669
2670 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2671 by a MULT_EXPR that we'll visit later, we might be able to
2672 get a more profitable match with fnma.
2673 OTOH, if we don't, a negate / fma pair has likely lower latency
2674 	 than a mult / subtract pair. */
2675 if (use_code == MINUS_EXPR && !negate_p
2676 && gimple_assign_rhs1 (use_stmt) == result
2677 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2678 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2679 {
2680 tree rhs2 = gimple_assign_rhs2 (use_stmt);
2681
2682 if (TREE_CODE (rhs2) == SSA_NAME)
2683 {
2684 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2685 if (has_single_use (rhs2)
2686 && is_gimple_assign (stmt2)
2687 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2688 return false;
2689 }
2690 }
2691
2692 /* We can't handle a * b + a * b. */
2693 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2694 return false;
2695
2696 /* While it is possible to validate whether or not the exact form
2697 that we've recognized is available in the backend, the assumption
2698 is that the transformation is never a loss. For instance, suppose
2699 the target only has the plain FMA pattern available. Consider
2700 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2701 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
2702 still have 3 operations, but in the FMA form the two NEGs are
2703 independent and could be run in parallel. */
2704 }
2705
2706 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2707 {
2708 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2709 enum tree_code use_code;
2710 tree addop, mulop1 = op1, result = mul_result;
2711 bool negate_p = false;
2712
2713 if (is_gimple_debug (use_stmt))
2714 continue;
2715
2716 use_code = gimple_assign_rhs_code (use_stmt);
2717 if (use_code == NEGATE_EXPR)
2718 {
2719 result = gimple_assign_lhs (use_stmt);
2720 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2721 gsi_remove (&gsi, true);
2722 release_defs (use_stmt);
2723
2724 use_stmt = neguse_stmt;
2725 gsi = gsi_for_stmt (use_stmt);
2726 use_code = gimple_assign_rhs_code (use_stmt);
2727 negate_p = true;
2728 }
2729
2730 if (gimple_assign_rhs1 (use_stmt) == result)
2731 {
2732 addop = gimple_assign_rhs2 (use_stmt);
2733 /* a * b - c -> a * b + (-c) */
2734 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2735 addop = force_gimple_operand_gsi (&gsi,
2736 build1 (NEGATE_EXPR,
2737 type, addop),
2738 true, NULL_TREE, true,
2739 GSI_SAME_STMT);
2740 }
2741 else
2742 {
2743 addop = gimple_assign_rhs1 (use_stmt);
2744 /* a - b * c -> (-b) * c + a */
2745 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2746 negate_p = !negate_p;
2747 }
2748
2749 if (negate_p)
2750 mulop1 = force_gimple_operand_gsi (&gsi,
2751 build1 (NEGATE_EXPR,
2752 type, mulop1),
2753 true, NULL_TREE, true,
2754 GSI_SAME_STMT);
2755
2756 fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2757 gimple_assign_lhs (use_stmt),
2758 mulop1, op2,
2759 addop);
2760 gsi_replace (&gsi, fma_stmt, true);
2761 widen_mul_stats.fmas_inserted++;
2762 }
2763
2764 return true;
2765 }
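
/* Illustrative source for the transformation above (assumptions: FP
   contraction is enabled, i.e. not -ffp-contract=off, and the target has an
   fma pattern): the multiplication has a single use in the addition, so the
   use is rewritten to FMA_EXPR <a, b, c> and the multiply becomes dead.  */

static double
fma_example (double a, double b, double c)
{
  return a * b + c;
}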
2766
2767 /* Find integer multiplications where the operands are extended from
2768 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2769 where appropriate. */
2770
2771 static unsigned int
2772 execute_optimize_widening_mul (void)
2773 {
2774 basic_block bb;
2775 bool cfg_changed = false;
2776
2777 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2778
2779 FOR_EACH_BB (bb)
2780 {
2781 gimple_stmt_iterator gsi;
2782
2783 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2784 {
2785 gimple stmt = gsi_stmt (gsi);
2786 enum tree_code code;
2787
2788 if (is_gimple_assign (stmt))
2789 {
2790 code = gimple_assign_rhs_code (stmt);
2791 switch (code)
2792 {
2793 case MULT_EXPR:
2794 if (!convert_mult_to_widen (stmt, &gsi)
2795 && convert_mult_to_fma (stmt,
2796 gimple_assign_rhs1 (stmt),
2797 gimple_assign_rhs2 (stmt)))
2798 {
2799 gsi_remove (&gsi, true);
2800 release_defs (stmt);
2801 continue;
2802 }
2803 break;
2804
2805 case PLUS_EXPR:
2806 case MINUS_EXPR:
2807 convert_plusminus_to_widen (&gsi, stmt, code);
2808 break;
2809
2810 default:;
2811 }
2812 }
2813 else if (is_gimple_call (stmt)
2814 && gimple_call_lhs (stmt))
2815 {
2816 tree fndecl = gimple_call_fndecl (stmt);
2817 if (fndecl
2818 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2819 {
2820 switch (DECL_FUNCTION_CODE (fndecl))
2821 {
2822 case BUILT_IN_POWF:
2823 case BUILT_IN_POW:
2824 case BUILT_IN_POWL:
2825 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2826 && REAL_VALUES_EQUAL
2827 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2828 dconst2)
2829 && convert_mult_to_fma (stmt,
2830 gimple_call_arg (stmt, 0),
2831 gimple_call_arg (stmt, 0)))
2832 {
2833 unlink_stmt_vdef (stmt);
2834 if (gsi_remove (&gsi, true)
2835 && gimple_purge_dead_eh_edges (bb))
2836 cfg_changed = true;
2837 release_defs (stmt);
2838 continue;
2839 }
2840 break;
2841
2842 default:;
2843 }
2844 }
2845 }
2846 gsi_next (&gsi);
2847 }
2848 }
2849
2850 statistics_counter_event (cfun, "widening multiplications inserted",
2851 widen_mul_stats.widen_mults_inserted);
2852 statistics_counter_event (cfun, "widening maccs inserted",
2853 widen_mul_stats.maccs_inserted);
2854 statistics_counter_event (cfun, "fused multiply-adds inserted",
2855 widen_mul_stats.fmas_inserted);
2856
2857 return cfg_changed ? TODO_cleanup_cfg : 0;
2858 }
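
/* Hedged sketch of the builtin case handled in the loop above: with the
   same FMA prerequisites as before, pow (x, 2.0) is treated as x * x, so
   when its result feeds an addition the call can be replaced by
   FMA_EXPR <x, x, y>.  */

static double
pow_squared_fma_example (double x, double y)
{
  return __builtin_pow (x, 2.0) + y;
}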
2859
2860 static bool
2861 gate_optimize_widening_mul (void)
2862 {
2863 return flag_expensive_optimizations && optimize;
2864 }
2865
2866 namespace {
2867
2868 const pass_data pass_data_optimize_widening_mul =
2869 {
2870 GIMPLE_PASS, /* type */
2871 "widening_mul", /* name */
2872 OPTGROUP_NONE, /* optinfo_flags */
2873 true, /* has_gate */
2874 true, /* has_execute */
2875 TV_NONE, /* tv_id */
2876 PROP_ssa, /* properties_required */
2877 0, /* properties_provided */
2878 0, /* properties_destroyed */
2879 0, /* todo_flags_start */
2880 ( TODO_verify_ssa | TODO_verify_stmts
2881 | TODO_update_ssa ), /* todo_flags_finish */
2882 };
2883
2884 class pass_optimize_widening_mul : public gimple_opt_pass
2885 {
2886 public:
2887 pass_optimize_widening_mul (gcc::context *ctxt)
2888 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2889 {}
2890
2891 /* opt_pass methods: */
2892 bool gate () { return gate_optimize_widening_mul (); }
2893 unsigned int execute () { return execute_optimize_widening_mul (); }
2894
2895 }; // class pass_optimize_widening_mul
2896
2897 } // anon namespace
2898
2899 gimple_opt_pass *
2900 make_pass_optimize_widening_mul (gcc::context *ctxt)
2901 {
2902 return new pass_optimize_widening_mul (ctxt);
2903 }