gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
   20 /* The first mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
   45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
   63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "gimple.h"
  94 #include "gimple-iterator.h"
  95 #include "gimplify-me.h"
  96 #include "gimple-ssa.h"
  97 #include "tree-cfg.h"
  98 #include "tree-phinodes.h"
  99 #include "ssa-iterators.h"
 100 #include "tree-ssanames.h"
 101 #include "tree-dfa.h"
 102 #include "tree-ssa.h"
 103 #include "tree-pass.h"
 104 #include "alloc-pool.h"
 105 #include "basic-block.h"
 106 #include "target.h"
 107 #include "gimple-pretty-print.h"
 108
 109 /* FIXME: RTL headers have to be included here for optabs.  */
 110 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 111 #include "expr.h"               /* Because optabs.h wants sepops.  */
 112 #include "optabs.h"
 113
 114 /* This structure represents one basic block that either computes a
 115    division, or is a common dominator for basic block that compute a
 116    division.  */
 117 struct occurrence {
 118   /* The basic block represented by this structure.  */
 119   basic_block bb;
 120
 121   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 122      inserted in BB.  */
 123   tree recip_def;
 124
 125   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 126      was inserted in BB.  */
 127   gimple recip_def_stmt;
 128
 129   /* Pointer to a list of "struct occurrence"s for blocks dominated
 130      by BB.  */
 131   struct occurrence *children;
 132
 133   /* Pointer to the next "struct occurrence"s in the list of blocks
 134      sharing a common dominator.  */
 135   struct occurrence *next;
 136
 137   /* The number of divisions that are in BB before compute_merit.  The
 138      number of divisions that are in BB or post-dominate it after
 139      compute_merit.  */
 140   int num_divisions;
 141
 142   /* True if the basic block has a division, false if it is a common
 143      dominator for basic blocks that do.  If it is false and trapping
 144      math is active, BB is not a candidate for inserting a reciprocal.  */
 145   bool bb_has_division;
 146 };
 147
 148 static struct
 149 {
 150   /* Number of 1.0/X ops inserted.  */
 151   int rdivs_inserted;
 152
 153   /* Number of 1.0/FUNC ops inserted.  */
 154   int rfuncs_inserted;
 155 } reciprocal_stats;
 156
 157 static struct
 158 {
 159   /* Number of cexpi calls inserted.  */
 160   int inserted;
 161 } sincos_stats;
 162
 163 static struct
 164 {
 165   /* Number of hand-written 16-bit bswaps found.  */
 166   int found_16bit;
 167
 168   /* Number of hand-written 32-bit bswaps found.  */
 169   int found_32bit;
 170
 171   /* Number of hand-written 64-bit bswaps found.  */
 172   int found_64bit;
 173 } bswap_stats;
 174
 175 static struct
 176 {
 177   /* Number of widening multiplication ops inserted.  */
 178   int widen_mults_inserted;
 179
 180   /* Number of integer multiply-and-accumulate ops inserted.  */
 181   int maccs_inserted;
 182
 183   /* Number of fp fused multiply-add ops inserted.  */
 184   int fmas_inserted;
 185 } widen_mul_stats;
 186
 187 /* The instance of "struct occurrence" representing the highest
 188    interesting block in the dominator tree.  */
 189 static struct occurrence *occ_head;
 190
 191 /* Allocation pool for getting instances of "struct occurrence".  */
 192 static alloc_pool occ_pool;
 193
 194
 195
 196 /* Allocate and return a new struct occurrence for basic block BB, and
 197    whose children list is headed by CHILDREN.  */
 198 static struct occurrence *
 199 occ_new (basic_block bb, struct occurrence *children)
 200 {
 201   struct occurrence *occ;
 202
 203   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 204   memset (occ, 0, sizeof (struct occurrence));
 205
 206   occ->bb = bb;
 207   occ->children = children;
 208   return occ;
 209 }
 210
 211
 212 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 213    list of "struct occurrence"s, one per basic block, having IDOM as
 214    their common dominator.
 215
 216    We try to insert NEW_OCC as deep as possible in the tree, and we also
 217    insert any other block that is a common dominator for BB and one
 218    block already in the tree.  */
 219
 220 static void
 221 insert_bb (struct occurrence *new_occ, basic_block idom,
 222            struct occurrence **p_head)
 223 {
 224   struct occurrence *occ, **p_occ;
 225
 226   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 227     {
 228       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 229       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 230       if (dom == bb)
 231         {
 232           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 233              from its list.  */
 234           *p_occ = occ->next;
 235           occ->next = new_occ->children;
 236           new_occ->children = occ;
 237
 238           /* Try the next block (it may as well be dominated by BB).  */
 239         }
 240
 241       else if (dom == occ_bb)
 242         {
 243           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 244           insert_bb (new_occ, dom, &occ->children);
 245           return;
 246         }
 247
 248       else if (dom != idom)
 249         {
 250           gcc_assert (!dom->aux);
 251
 252           /* There is a dominator between IDOM and BB, add it and make
 253              two children out of NEW_OCC and OCC.  First, remove OCC from
 254              its list.  */
 255           *p_occ = occ->next;
 256           new_occ->next = occ;
 257           occ->next = NULL;
 258
 259           /* None of the previous blocks has DOM as a dominator: if we tail
 260              recursed, we would reexamine them uselessly. Just switch BB with
 261              DOM, and go on looking for blocks dominated by DOM.  */
 262           new_occ = occ_new (dom, new_occ);
 263         }
 264
 265       else
 266         {
 267           /* Nothing special, go on with the next element.  */
 268           p_occ = &occ->next;
 269         }
 270     }
 271
 272   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 273   new_occ->next = *p_head;
 274   *p_head = new_occ;
 275 }
 276
 277 /* Register that we found a division in BB.  */
 278
 279 static inline void
 280 register_division_in (basic_block bb)
 281 {
 282   struct occurrence *occ;
 283
 284   occ = (struct occurrence *) bb->aux;
 285   if (!occ)
 286     {
 287       occ = occ_new (bb, NULL);
 288       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 289     }
 290
 291   occ->bb_has_division = true;
 292   occ->num_divisions++;
 293 }
 294
 295
 296 /* Compute the number of divisions that postdominate each block in OCC and
 297    its children.  */
 298
 299 static void
 300 compute_merit (struct occurrence *occ)
 301 {
 302   struct occurrence *occ_child;
 303   basic_block dom = occ->bb;
 304
 305   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 306     {
 307       basic_block bb;
 308       if (occ_child->children)
 309         compute_merit (occ_child);
 310
 311       if (flag_exceptions)
 312         bb = single_noncomplex_succ (dom);
 313       else
 314         bb = dom;
 315
 316       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 317         occ->num_divisions += occ_child->num_divisions;
 318     }
 319 }
 320
 321
 322 /* Return whether USE_STMT is a floating-point division by DEF.  */
 323 static inline bool
 324 is_division_by (gimple use_stmt, tree def)
 325 {
 326   return is_gimple_assign (use_stmt)
 327          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 328          && gimple_assign_rhs2 (use_stmt) == def
 329          /* Do not recognize x / x as valid division, as we are getting
 330             confused later by replacing all immediate uses x in such
 331             a stmt.  */
 332          && gimple_assign_rhs1 (use_stmt) != def;
 333 }
 334
 335 /* Walk the subset of the dominator tree rooted at OCC, setting the
 336    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 337    the given basic block.  The field may be left NULL, of course,
 338    if it is not possible or profitable to do the optimization.
 339
 340    DEF_BSI is an iterator pointing at the statement defining DEF.
 341    If RECIP_DEF is set, a dominator already has a computation that can
 342    be used.  */
 343
 344 static void
 345 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 346                     tree def, tree recip_def, int threshold)
 347 {
 348   tree type;
 349   gimple new_stmt;
 350   gimple_stmt_iterator gsi;
 351   struct occurrence *occ_child;
 352
 353   if (!recip_def
 354       && (occ->bb_has_division || !flag_trapping_math)
 355       && occ->num_divisions >= threshold)
 356     {
 357       /* Make a variable with the replacement and substitute it.  */
 358       type = TREE_TYPE (def);
 359       recip_def = create_tmp_reg (type, "reciptmp");
 360       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 361                                                build_one_cst (type), def);
 362
 363       if (occ->bb_has_division)
 364         {
 365           /* Case 1: insert before an existing division.  */
 366           gsi = gsi_after_labels (occ->bb);
 367           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 368             gsi_next (&gsi);
 369
 370           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 371         }
 372       else if (def_gsi && occ->bb == def_gsi->bb)
 373         {
 374           /* Case 2: insert right after the definition.  Note that this will
 375              never happen if the definition statement can throw, because in
 376              that case the sole successor of the statement's basic block will
 377              dominate all the uses as well.  */
 378           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 379         }
 380       else
 381         {
 382           /* Case 3: insert in a basic block not containing defs/uses.  */
 383           gsi = gsi_after_labels (occ->bb);
 384           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 385         }
 386
 387       reciprocal_stats.rdivs_inserted++;
 388
 389       occ->recip_def_stmt = new_stmt;
 390     }
 391
 392   occ->recip_def = recip_def;
 393   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 394     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 395 }
 396
 397
 398 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 399    possible.  */
 400
 401 static inline void
 402 replace_reciprocal (use_operand_p use_p)
 403 {
 404   gimple use_stmt = USE_STMT (use_p);
 405   basic_block bb = gimple_bb (use_stmt);
 406   struct occurrence *occ = (struct occurrence *) bb->aux;
 407
 408   if (optimize_bb_for_speed_p (bb)
 409       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 410     {
 411       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 412       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 413       SET_USE (use_p, occ->recip_def);
 414       fold_stmt_inplace (&gsi);
 415       update_stmt (use_stmt);
 416     }
 417 }
 418
 419
 420 /* Free OCC and return one more "struct occurrence" to be freed.  */
 421
 422 static struct occurrence *
 423 free_bb (struct occurrence *occ)
 424 {
 425   struct occurrence *child, *next;
 426
 427   /* First get the two pointers hanging off OCC.  */
 428   next = occ->next;
 429   child = occ->children;
 430   occ->bb->aux = NULL;
 431   pool_free (occ_pool, occ);
 432
 433   /* Now ensure that we don't recurse unless it is necessary.  */
 434   if (!child)
 435     return next;
 436   else
 437     {
 438       while (next)
 439         next = free_bb (next);
 440
 441       return child;
 442     }
 443 }
 444
 445
 446 /* Look for floating-point divisions among DEF's uses, and try to
 447    replace them by multiplications with the reciprocal.  Add
 448    as many statements computing the reciprocal as needed.
 449
 450    DEF must be a GIMPLE register of a floating-point type.  */
 451
 452 static void
 453 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 454 {
 455   use_operand_p use_p;
 456   imm_use_iterator use_iter;
 457   struct occurrence *occ;
 458   int count = 0, threshold;
 459
 460   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 461
 462   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 463     {
 464       gimple use_stmt = USE_STMT (use_p);
 465       if (is_division_by (use_stmt, def))
 466         {
 467           register_division_in (gimple_bb (use_stmt));
 468           count++;
 469         }
 470     }
 471
 472   /* Do the expensive part only if we can hope to optimize something.  */
 473   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 474   if (count >= threshold)
 475     {
 476       gimple use_stmt;
 477       for (occ = occ_head; occ; occ = occ->next)
 478         {
 479           compute_merit (occ);
 480           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 481         }
 482
 483       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 484         {
 485           if (is_division_by (use_stmt, def))
 486             {
 487               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 488                 replace_reciprocal (use_p);
 489             }
 490         }
 491     }
 492
 493   for (occ = occ_head; occ; )
 494     occ = free_bb (occ);
 495
 496   occ_head = NULL;
 497 }
 498
 499 static bool
 500 gate_cse_reciprocals (void)
 501 {
 502   return optimize && flag_reciprocal_math;
 503 }
 504
 505 /* Go through all the floating-point SSA_NAMEs, and call
 506    execute_cse_reciprocals_1 on each of them.  */
 507 static unsigned int
 508 execute_cse_reciprocals (void)
 509 {
 510   basic_block bb;
 511   tree arg;
 512
 513   occ_pool = create_alloc_pool ("dominators for recip",
 514                                 sizeof (struct occurrence),
 515                                 n_basic_blocks / 3 + 1);
 516
 517   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 518   calculate_dominance_info (CDI_DOMINATORS);
 519   calculate_dominance_info (CDI_POST_DOMINATORS);
 520
 521 #ifdef ENABLE_CHECKING
 522   FOR_EACH_BB (bb)
 523     gcc_assert (!bb->aux);
 524 #endif
 525
 526   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 527     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 528         && is_gimple_reg (arg))
 529       {
 530         tree name = ssa_default_def (cfun, arg);
 531         if (name)
 532           execute_cse_reciprocals_1 (NULL, name);
 533       }
 534
 535   FOR_EACH_BB (bb)
 536     {
 537       gimple_stmt_iterator gsi;
 538       gimple phi;
 539       tree def;
 540
 541       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 542         {
 543           phi = gsi_stmt (gsi);
 544           def = PHI_RESULT (phi);
 545           if (! virtual_operand_p (def)
 546               && FLOAT_TYPE_P (TREE_TYPE (def)))
 547             execute_cse_reciprocals_1 (NULL, def);
 548         }
 549
 550       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 551         {
 552           gimple stmt = gsi_stmt (gsi);
 553
 554           if (gimple_has_lhs (stmt)
 555               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 556               && FLOAT_TYPE_P (TREE_TYPE (def))
 557               && TREE_CODE (def) == SSA_NAME)
 558             execute_cse_reciprocals_1 (&gsi, def);
 559         }
 560
 561       if (optimize_bb_for_size_p (bb))
 562         continue;
 563
 564       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 565       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 566         {
 567           gimple stmt = gsi_stmt (gsi);
 568           tree fndecl;
 569
 570           if (is_gimple_assign (stmt)
 571               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 572             {
 573               tree arg1 = gimple_assign_rhs2 (stmt);
 574               gimple stmt1;
 575
 576               if (TREE_CODE (arg1) != SSA_NAME)
 577                 continue;
 578
 579               stmt1 = SSA_NAME_DEF_STMT (arg1);
 580
 581               if (is_gimple_call (stmt1)
 582                   && gimple_call_lhs (stmt1)
 583                   && (fndecl = gimple_call_fndecl (stmt1))
 584                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 585                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 586                 {
 587                   enum built_in_function code;
 588                   bool md_code, fail;
 589                   imm_use_iterator ui;
 590                   use_operand_p use_p;
 591
 592                   code = DECL_FUNCTION_CODE (fndecl);
 593                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 594
 595                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 596                   if (!fndecl)
 597                     continue;
 598
 599                   /* Check that all uses of the SSA name are divisions,
 600                      otherwise replacing the defining statement will do
 601                      the wrong thing.  */
 602                   fail = false;
 603                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 604                     {
 605                       gimple stmt2 = USE_STMT (use_p);
 606                       if (is_gimple_debug (stmt2))
 607                         continue;
 608                       if (!is_gimple_assign (stmt2)
 609                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 610                           || gimple_assign_rhs1 (stmt2) == arg1
 611                           || gimple_assign_rhs2 (stmt2) != arg1)
 612                         {
 613                           fail = true;
 614                           break;
 615                         }
 616                     }
 617                   if (fail)
 618                     continue;
 619
 620                   gimple_replace_ssa_lhs (stmt1, arg1);
 621                   gimple_call_set_fndecl (stmt1, fndecl);
 622                   update_stmt (stmt1);
 623                   reciprocal_stats.rfuncs_inserted++;
 624
 625                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 626                     {
 627                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 628                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 629                       fold_stmt_inplace (&gsi);
 630                       update_stmt (stmt);
 631                     }
 632                 }
 633             }
 634         }
 635     }
 636
 637   statistics_counter_event (cfun, "reciprocal divs inserted",
 638                             reciprocal_stats.rdivs_inserted);
 639   statistics_counter_event (cfun, "reciprocal functions inserted",
 640                             reciprocal_stats.rfuncs_inserted);
 641
 642   free_dominance_info (CDI_DOMINATORS);
 643   free_dominance_info (CDI_POST_DOMINATORS);
 644   free_alloc_pool (occ_pool);
 645   return 0;
 646 }
 647
 648 namespace {
 649
 650 const pass_data pass_data_cse_reciprocals =
 651 {
 652   GIMPLE_PASS, /* type */
 653   "recip", /* name */
 654   OPTGROUP_NONE, /* optinfo_flags */
 655   true, /* has_gate */
 656   true, /* has_execute */
 657   TV_NONE, /* tv_id */
 658   PROP_ssa, /* properties_required */
 659   0, /* properties_provided */
 660   0, /* properties_destroyed */
 661   0, /* todo_flags_start */
 662   ( TODO_update_ssa | TODO_verify_ssa
 663     | TODO_verify_stmts ), /* todo_flags_finish */
 664 };
 665
 666 class pass_cse_reciprocals : public gimple_opt_pass
 667 {
 668 public:
 669   pass_cse_reciprocals (gcc::context *ctxt)
 670     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 671   {}
 672
 673   /* opt_pass methods: */
 674   bool gate () { return gate_cse_reciprocals (); }
 675   unsigned int execute () { return execute_cse_reciprocals (); }
 676
 677 }; // class pass_cse_reciprocals
 678
 679 } // anon namespace
 680
 681 gimple_opt_pass *
 682 make_pass_cse_reciprocals (gcc::context *ctxt)
 683 {
 684   return new pass_cse_reciprocals (ctxt);
 685 }
 686
 687 /* Records an occurrence at statement USE_STMT in the vector of trees
 688    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 689    is not yet initialized.  Returns true if the occurrence was pushed on
 690    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 691    statements in the vector.  */
 692
 693 static bool
 694 maybe_record_sincos (vec<gimple> *stmts,
 695                      basic_block *top_bb, gimple use_stmt)
 696 {
 697   basic_block use_bb = gimple_bb (use_stmt);
 698   if (*top_bb
 699       && (*top_bb == use_bb
 700           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 701     stmts->safe_push (use_stmt);
 702   else if (!*top_bb
 703            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 704     {
 705       stmts->safe_push (use_stmt);
 706       *top_bb = use_bb;
 707     }
 708   else
 709     return false;
 710
 711   return true;
 712 }
 713
 714 /* Look for sin, cos and cexpi calls with the same argument NAME and
 715    create a single call to cexpi CSEing the result in this case.
 716    We first walk over all immediate uses of the argument collecting
 717    statements that we can CSE in a vector and in a second pass replace
 718    the statement rhs with a REALPART or IMAGPART expression on the
 719    result of the cexpi call we insert before the use statement that
 720    dominates all other candidates.  */
 721
 722 static bool
 723 execute_cse_sincos_1 (tree name)
 724 {
 725   gimple_stmt_iterator gsi;
 726   imm_use_iterator use_iter;
 727   tree fndecl, res, type;
 728   gimple def_stmt, use_stmt, stmt;
 729   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 730   vec<gimple> stmts = vNULL;
 731   basic_block top_bb = NULL;
 732   int i;
 733   bool cfg_changed = false;
 734
 735   type = TREE_TYPE (name);
 736   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 737     {
 738       if (gimple_code (use_stmt) != GIMPLE_CALL
 739           || !gimple_call_lhs (use_stmt)
 740           || !(fndecl = gimple_call_fndecl (use_stmt))
 741           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 742         continue;
 743
 744       switch (DECL_FUNCTION_CODE (fndecl))
 745         {
 746         CASE_FLT_FN (BUILT_IN_COS):
 747           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 748           break;
 749
 750         CASE_FLT_FN (BUILT_IN_SIN):
 751           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 752           break;
 753
 754         CASE_FLT_FN (BUILT_IN_CEXPI):
 755           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 756           break;
 757
 758         default:;
 759         }
 760     }
 761
 762   if (seen_cos + seen_sin + seen_cexpi <= 1)
 763     {
 764       stmts.release ();
 765       return false;
 766     }
 767
 768   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 769      the name def statement.  */
 770   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 771   if (!fndecl)
 772     return false;
 773   stmt = gimple_build_call (fndecl, 1, name);
 774   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 775   gimple_call_set_lhs (stmt, res);
 776
 777   def_stmt = SSA_NAME_DEF_STMT (name);
 778   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 779       && gimple_code (def_stmt) != GIMPLE_PHI
 780       && gimple_bb (def_stmt) == top_bb)
 781     {
 782       gsi = gsi_for_stmt (def_stmt);
 783       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 784     }
 785   else
 786     {
 787       gsi = gsi_after_labels (top_bb);
 788       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 789     }
 790   sincos_stats.inserted++;
 791
 792   /* And adjust the recorded old call sites.  */
 793   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 794     {
 795       tree rhs = NULL;
 796       fndecl = gimple_call_fndecl (use_stmt);
 797
 798       switch (DECL_FUNCTION_CODE (fndecl))
 799         {
 800         CASE_FLT_FN (BUILT_IN_COS):
 801           rhs = fold_build1 (REALPART_EXPR, type, res);
 802           break;
 803
 804         CASE_FLT_FN (BUILT_IN_SIN):
 805           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 806           break;
 807
 808         CASE_FLT_FN (BUILT_IN_CEXPI):
 809           rhs = res;
 810           break;
 811
 812         default:;
 813           gcc_unreachable ();
 814         }
 815
 816         /* Replace call with a copy.  */
 817         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 818
 819         gsi = gsi_for_stmt (use_stmt);
 820         gsi_replace (&gsi, stmt, true);
 821         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 822           cfg_changed = true;
 823     }
 824
 825   stmts.release ();
 826
 827   return cfg_changed;
 828 }
 829
 830 /* To evaluate powi(x,n), the floating point value x raised to the
 831    constant integer exponent n, we use a hybrid algorithm that
 832    combines the "window method" with look-up tables.  For an
 833    introduction to exponentiation algorithms and "addition chains",
 834    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 835    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 836    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 837    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 838
 839 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 840    multiplications to inline before calling the system library's pow
 841    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 842    so this default never requires calling pow, powf or powl.  */
 843
 844 #ifndef POWI_MAX_MULTS
 845 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 846 #endif
 847
 848 /* The size of the "optimal power tree" lookup table.  All
 849    exponents less than this value are simply looked up in the
 850    powi_table below.  This threshold is also used to size the
 851    cache of pseudo registers that hold intermediate results.  */
 852 #define POWI_TABLE_SIZE 256
 853
 854 /* The size, in bits of the window, used in the "window method"
 855    exponentiation algorithm.  This is equivalent to a radix of
 856    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 857 #define POWI_WINDOW_SIZE 3
 858
/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".

   The table is indexed by the exponent i, and has exactly
   POWI_TABLE_SIZE entries; the trailing comment on each row gives
   the range of exponents that row covers.  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
 901
 902
 903 /* Return the number of multiplications required to calculate
 904    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 905    subroutine of powi_cost.  CACHE is an array indicating
 906    which exponents have already been calculated.  */
 907
 908 static int
 909 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 910 {
 911   /* If we've already calculated this exponent, then this evaluation
 912      doesn't require any additional multiplications.  */
 913   if (cache[n])
 914     return 0;
 915
 916   cache[n] = true;
 917   return powi_lookup_cost (n - powi_table[n], cache)
 918          + powi_lookup_cost (powi_table[n], cache) + 1;
 919 }
 920
 921 /* Return the number of multiplications required to calculate
 922    powi(x,n) for an arbitrary x, given the exponent N.  This
 923    function needs to be kept in sync with powi_as_mults below.  */
 924
 925 static int
 926 powi_cost (HOST_WIDE_INT n)
 927 {
 928   bool cache[POWI_TABLE_SIZE];
 929   unsigned HOST_WIDE_INT digit;
 930   unsigned HOST_WIDE_INT val;
 931   int result;
 932
 933   if (n == 0)
 934     return 0;
 935
 936   /* Ignore the reciprocal when calculating the cost.  */
 937   val = (n < 0) ? -n : n;
 938
 939   /* Initialize the exponent cache.  */
 940   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 941   cache[1] = true;
 942
 943   result = 0;
 944
 945   while (val >= POWI_TABLE_SIZE)
 946     {
 947       if (val & 1)
 948         {
 949           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 950           result += powi_lookup_cost (digit, cache)
 951                     + POWI_WINDOW_SIZE + 1;
 952           val >>= POWI_WINDOW_SIZE;
 953         }
 954       else
 955         {
 956           val >>= 1;
 957           result++;
 958         }
 959     }
 960
 961   return result + powi_lookup_cost (val, cache);
 962 }
 963
 964 /* Recursive subroutine of powi_as_mults.  This function takes the
 965    array, CACHE, of already calculated exponents and an exponent N and
 966    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 967
 968 static tree
 969 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 970                  HOST_WIDE_INT n, tree *cache)
 971 {
 972   tree op0, op1, ssa_target;
 973   unsigned HOST_WIDE_INT digit;
 974   gimple mult_stmt;
 975
 976   if (n < POWI_TABLE_SIZE && cache[n])
 977     return cache[n];
 978
 979   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 980
 981   if (n < POWI_TABLE_SIZE)
 982     {
 983       cache[n] = ssa_target;
 984       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 985       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 986     }
 987   else if (n & 1)
 988     {
 989       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 990       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 991       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 992     }
 993   else
 994     {
 995       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 996       op1 = op0;
 997     }
 998
 999   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1000   gimple_set_location (mult_stmt, loc);
1001   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1002
1003   return ssa_target;
1004 }
1005
1006 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1007    This function needs to be kept in sync with powi_cost above.  */
1008
1009 static tree
1010 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1011                tree arg0, HOST_WIDE_INT n)
1012 {
1013   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1014   gimple div_stmt;
1015   tree target;
1016
1017   if (n == 0)
1018     return build_real (type, dconst1);
1019
1020   memset (cache, 0,  sizeof (cache));
1021   cache[1] = arg0;
1022
1023   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1024   if (n >= 0)
1025     return result;
1026
1027   /* If the original exponent was negative, reciprocate the result.  */
1028   target = make_temp_ssa_name (type, NULL, "powmult");
1029   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1030                                            build_real (type, dconst1),
1031                                            result);
1032   gimple_set_location (div_stmt, loc);
1033   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1034
1035   return target;
1036 }
1037
1038 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1039    location info LOC.  If the arguments are appropriate, create an
1040    equivalent sequence of statements prior to GSI using an optimal
1041    number of multiplications, and return an expession holding the
1042    result.  */
1043
1044 static tree
1045 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1046                             tree arg0, HOST_WIDE_INT n)
1047 {
1048   /* Avoid largest negative number.  */
1049   if (n != -n
1050       && ((n >= -1 && n <= 2)
1051           || (optimize_function_for_speed_p (cfun)
1052               && powi_cost (n) <= POWI_MAX_MULTS)))
1053     return powi_as_mults (gsi, loc, arg0, n);
1054
1055   return NULL_TREE;
1056 }
1057
1058 /* Build a gimple call statement that calls FN with argument ARG.
1059    Set the lhs of the call statement to a fresh SSA name.  Insert the
1060    statement prior to GSI's current position, and return the fresh
1061    SSA name.  */
1062
1063 static tree
1064 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1065                        tree fn, tree arg)
1066 {
1067   gimple call_stmt;
1068   tree ssa_target;
1069
1070   call_stmt = gimple_build_call (fn, 1, arg);
1071   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1072   gimple_set_lhs (call_stmt, ssa_target);
1073   gimple_set_location (call_stmt, loc);
1074   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1075
1076   return ssa_target;
1077 }
1078
1079 /* Build a gimple binary operation with the given CODE and arguments
1080    ARG0, ARG1, assigning the result to a new SSA name for variable
1081    TARGET.  Insert the statement prior to GSI's current position, and
1082    return the fresh SSA name.*/
1083
1084 static tree
1085 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1086                         const char *name, enum tree_code code,
1087                         tree arg0, tree arg1)
1088 {
1089   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1090   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1091   gimple_set_location (stmt, loc);
1092   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1093   return result;
1094 }
1095
1096 /* Build a gimple reference operation with the given CODE and argument
1097    ARG, assigning the result to a new SSA name of TYPE with NAME.
1098    Insert the statement prior to GSI's current position, and return
1099    the fresh SSA name.  */
1100
1101 static inline tree
1102 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1103                       const char *name, enum tree_code code, tree arg0)
1104 {
1105   tree result = make_temp_ssa_name (type, NULL, name);
1106   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1107   gimple_set_location (stmt, loc);
1108   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1109   return result;
1110 }
1111
1112 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1113    prior to GSI's current position, and return the fresh SSA name.  */
1114
1115 static tree
1116 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1117                        tree type, tree val)
1118 {
1119   tree result = make_ssa_name (type, NULL);
1120   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1121   gimple_set_location (stmt, loc);
1122   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1123   return result;
1124 }
1125
/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  Return NULL_TREE when no
   replacement applies.

   The exponent must be a REAL_CST.  The cases tried, in order, are:
   an integral exponent, 0.5, 0.25, 0.75, 1/3, 1/6, a half-integer
   exponent, and an exponent that is a multiple of 1/3.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
  enum machine_mode mode;
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  C is an integer exactly
     when converting it to N and back reproduces the same value.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  c_is_int = real_identical (&c, &cint);

  /* Note that the result of gimple_expand_builtin_powi is returned
     as is, so a NULL_TREE from it ends the attempt here.  */
  if (c_is_int
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && optimize_insn_for_speed_p ()
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && REAL_VALUES_EQUAL (c, dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
     a builtin sqrt instruction is smaller than a call to pow with 0.25,
     so do this optimization even if -Os.  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 0.25 is made
     from 1.0 by lowering its exponent by two.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && REAL_VALUES_EQUAL (c, dconst1_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
     optimizing for space.  Don't do this optimization if we don't have
     a hardware sqrt insn.  The constant 0.75 is made from 3.0 by
     lowering its exponent by two.  */
  real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
  SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && optimize_function_for_speed_p (cfun)
      && REAL_VALUES_EQUAL (c, dconst3_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);

      /* sqrt(x) * sqrt(sqrt(x))  */
      return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                     sqrt_arg0, sqrt_sqrt);
    }

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && REAL_VALUES_EQUAL (c, dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 1./6. is made
     from the truncated 1./3. by lowering its exponent by one.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && optimize_function_for_speed_p (cfun)
      && hw_sqrt_exists
      && REAL_VALUES_EQUAL (c, dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,c), where n = 2c for some nonzero integer n
     and c not an integer, into

       sqrt(x) * powi(x, n/2),                n > 0;
       1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.

     Do not calculate the powi factor when n/2 = 0.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  c2_is_int = real_identical (&c2, &cint);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && c2_is_int
      && !c_is_int
      && optimize_function_for_speed_p (cfun))
    {
      tree powi_x_ndiv2 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         n is 1 or -1, where n/2 is 0 and the powi factor is always 1.
         Nothing has been inserted yet at this point, so giving up
         leaves the statement stream untouched.  */
      if (absu_hwi (n) != 1)
        {
          powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 2));
          if (!powi_x_ndiv2)
            return NULL_TREE;
        }

      /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
         result of the optimal multiply sequence just calculated.  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      if (absu_hwi (n) == 1)
        result = sqrt_arg0;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         sqrt_arg0, powi_x_ndiv2);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);
      return result;
    }

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.

     N is 3c rounded to an integer; C2 is then overwritten with N/3
     converted to MODE, so that comparing it with C tells whether C is
     the MODE representation of N/3.  */
  real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  /* C2_IS_INT was not recomputed above: it still says whether 2*C is
     an integer, as determined for the previous transformation.  */
  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where n/3 is 0 and the powi factor is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
1367
1368 /* ARG is the argument to a cabs builtin call in GSI with location info
1369    LOC.  Create a sequence of statements prior to GSI that calculates
1370    sqrt(R*R + I*I), where R and I are the real and imaginary components
1371    of ARG, respectively.  Return an expression holding the result.  */
1372
1373 static tree
1374 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1375 {
1376   tree real_part, imag_part, addend1, addend2, sum, result;
1377   tree type = TREE_TYPE (TREE_TYPE (arg));
1378   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1379   enum machine_mode mode = TYPE_MODE (type);
1380
1381   if (!flag_unsafe_math_optimizations
1382       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1383       || !sqrtfn
1384       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1385     return NULL_TREE;
1386
1387   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1388                                     REALPART_EXPR, arg);
1389   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1390                                     real_part, real_part);
1391   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1392                                     IMAGPART_EXPR, arg);
1393   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1394                                     imag_part, imag_part);
1395   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1396   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1397
1398   return result;
1399 }
1400
/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
   an optimal number of multiplies, when n is a constant, and try to
   replace calls to pow and cabs by cheaper statement sequences.

   Only calls to normal builtins whose result is used (i.e. that have
   an lhs) are considered.  Return TODO_cleanup_cfg if the CFG was
   changed, and 0 otherwise.  */

static unsigned int
execute_cse_sincos (void)
{
  basic_block bb;
  bool cfg_changed = false;

  calculate_dominance_info (CDI_DOMINATORS);
  memset (&sincos_stats, 0, sizeof (sincos_stats));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;
      bool cleanup_eh = false;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree fndecl;

          /* Only the last stmt in a bb could throw, no need to call
             gimple_purge_dead_eh_edges if we change something in the middle
             of a basic block.  Hence the flag is reset for every
             statement, and only the value left by the last one is
             looked at after the loop.  */
          cleanup_eh = false;

          if (is_gimple_call (stmt)
              && gimple_call_lhs (stmt)
              && (fndecl = gimple_call_fndecl (stmt))
              && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
            {
              tree arg, arg0, arg1, result;
              HOST_WIDE_INT n;
              location_t loc;

              switch (DECL_FUNCTION_CODE (fndecl))
                {
                CASE_FLT_FN (BUILT_IN_COS):
                CASE_FLT_FN (BUILT_IN_SIN):
                CASE_FLT_FN (BUILT_IN_CEXPI):
                  /* Make sure we have either sincos or cexp.  */
                  if (!targetm.libc_has_function (function_c99_math_complex)
                      && !targetm.libc_has_function (function_sincos))
                    break;

                  arg = gimple_call_arg (stmt, 0);
                  if (TREE_CODE (arg) == SSA_NAME)
                    cfg_changed |= execute_cse_sincos_1 (arg);
                  break;

                CASE_FLT_FN (BUILT_IN_POW):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);

                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);

                  /* On success the new statements are already in place
                     before the call; replace the call itself by a copy
                     of RESULT into its lhs.  */
                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_POWI):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);
                  loc = gimple_location (stmt);

                  if (real_minus_onep (arg0))
                    {
                      /* powi(-1,n): select -1 or 1 depending on the
                         low bit of the exponent, which need not be a
                         constant here.  */
                      tree t0, t1, cond, one, minus_one;
                      /* This declaration shadows the outer STMT, which
                         is visible again once this block ends.  */
                      gimple stmt;

                      t0 = TREE_TYPE (arg0);
                      t1 = TREE_TYPE (arg1);
                      one = build_real (t0, dconst1);
                      minus_one = build_real (t0, dconstm1);

                      /* cond = arg1 & 1  */
                      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
                      stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
                                                           arg1,
                                                           build_int_cst (t1,
                                                                          1));
                      gimple_set_location (stmt, loc);
                      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);

                      /* result = cond ? -1 : 1  */
                      result = make_temp_ssa_name (t0, NULL, "powi");
                      stmt = gimple_build_assign_with_ops (COND_EXPR, result,
                                                           cond,
                                                           minus_one, one);
                      gimple_set_location (stmt, loc);
                      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
                    }
                  else
                    {
                      /* Any other base needs a constant exponent that
                         fits a signed HOST_WIDE_INT.  */
                      if (!host_integerp (arg1, 0))
                        break;

                      n = TREE_INT_CST_LOW (arg1);
                      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
                    }

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_CABS):
                  arg0 = gimple_call_arg (stmt, 0);
                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                default:;
                }
            }
        }
      /* CLEANUP_EH is set only if the last statement visited in this
         block was replaced.  */
      if (cleanup_eh)
        cfg_changed |= gimple_purge_dead_eh_edges (bb);
    }

  statistics_counter_event (cfun, "sincos statements inserted",
                            sincos_stats.inserted);

  free_dominance_info (CDI_DOMINATORS);
  return cfg_changed ? TODO_cleanup_cfg : 0;
}
1557
1558 static bool
1559 gate_cse_sincos (void)
1560 {
1561   /* We no longer require either sincos or cexp, since powi expansion
1562      piggybacks on this pass.  */
1563   return optimize;
1564 }
1565
namespace {

/* Static description of the "sincos" GIMPLE pass: it has both a gate
   and an execute hook, requires SSA form, and asks for an SSA update
   followed by SSA and statement verification when it finishes.  */

const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  true, /* has_gate */
  true, /* has_execute */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  ( TODO_update_ssa | TODO_verify_ssa
    | TODO_verify_stmts ), /* todo_flags_finish */
};

/* The pass object itself.  It only forwards the gate and execute
   hooks to gate_cse_sincos and execute_cse_sincos.  */

class pass_cse_sincos : public gimple_opt_pass
{
public:
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  bool gate () { return gate_cse_sincos (); }
  unsigned int execute () { return execute_cse_sincos (); }

}; // class pass_cse_sincos

} // anon namespace
1598
1599 gimple_opt_pass *
1600 make_pass_cse_sincos (gcc::context *ctxt)
1601 {
1602   return new pass_cse_sincos (ctxt);
1603 }
1604
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of byte size markers:

   0    - byte has the value 0
   1..size - byte contains the content of the byte
   number indexed with that value minus one

   The field SIZE is the width of the described value, counted in
   units of BITS_PER_UNIT bits.  */

struct symbolic_number {
  /* The markers, one per byte of the value.  */
  unsigned HOST_WIDEST_INT n;
  /* Number of units that are significant in N.  */
  int size;
};
1617
1618 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1619    number N.  Return false if the requested operation is not permitted
1620    on a symbolic number.  */
1621
1622 static inline bool
1623 do_shift_rotate (enum tree_code code,
1624                  struct symbolic_number *n,
1625                  int count)
1626 {
1627   if (count % 8 != 0)
1628     return false;
1629
1630   /* Zero out the extra bits of N in order to avoid them being shifted
1631      into the significant bits.  */
1632   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1633     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1634
1635   switch (code)
1636     {
1637     case LSHIFT_EXPR:
1638       n->n <<= count;
1639       break;
1640     case RSHIFT_EXPR:
1641       n->n >>= count;
1642       break;
1643     case LROTATE_EXPR:
1644       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1645       break;
1646     case RROTATE_EXPR:
1647       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1648       break;
1649     default:
1650       return false;
1651     }
1652   /* Zero unused bits for size.  */
1653   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1654     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1655   return true;
1656 }
1657
1658 /* Perform sanity checking for the symbolic number N and the gimple
1659    statement STMT.  */
1660
1661 static inline bool
1662 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1663 {
1664   tree lhs_type;
1665
1666   lhs_type = gimple_expr_type (stmt);
1667
1668   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1669     return false;
1670
1671   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1672     return false;
1673
1674   return true;
1675 }
1676
1677 /* find_bswap_1 invokes itself recursively with N and tries to perform
1678    the operation given by the rhs of STMT on the result.  If the
1679    operation could successfully be executed the function returns the
1680    tree expression of the source operand and NULL otherwise.  */
1681
1682 static tree
1683 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1684 {
1685   enum tree_code code;
1686   tree rhs1, rhs2 = NULL;
1687   gimple rhs1_stmt, rhs2_stmt;
1688   tree source_expr1;
1689   enum gimple_rhs_class rhs_class;
1690
1691   if (!limit || !is_gimple_assign (stmt))
1692     return NULL_TREE;
1693
1694   rhs1 = gimple_assign_rhs1 (stmt);
1695
1696   if (TREE_CODE (rhs1) != SSA_NAME)
1697     return NULL_TREE;
1698
1699   code = gimple_assign_rhs_code (stmt);
1700   rhs_class = gimple_assign_rhs_class (stmt);
1701   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1702
1703   if (rhs_class == GIMPLE_BINARY_RHS)
1704     rhs2 = gimple_assign_rhs2 (stmt);
1705
1706   /* Handle unary rhs and binary rhs with integer constants as second
1707      operand.  */
1708
1709   if (rhs_class == GIMPLE_UNARY_RHS
1710       || (rhs_class == GIMPLE_BINARY_RHS
1711           && TREE_CODE (rhs2) == INTEGER_CST))
1712     {
1713       if (code != BIT_AND_EXPR
1714           && code != LSHIFT_EXPR
1715           && code != RSHIFT_EXPR
1716           && code != LROTATE_EXPR
1717           && code != RROTATE_EXPR
1718           && code != NOP_EXPR
1719           && code != CONVERT_EXPR)
1720         return NULL_TREE;
1721
1722       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1723
1724       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1725          to initialize the symbolic number.  */
1726       if (!source_expr1)
1727         {
1728           /* Set up the symbolic number N by setting each byte to a
1729              value between 1 and the byte size of rhs1.  The highest
1730              order byte is set to n->size and the lowest order
1731              byte to 1.  */
1732           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1733           if (n->size % BITS_PER_UNIT != 0)
1734             return NULL_TREE;
1735           n->size /= BITS_PER_UNIT;
1736           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1737                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1738
1739           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1740             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1741                      (n->size * BITS_PER_UNIT)) - 1;
1742
1743           source_expr1 = rhs1;
1744         }
1745
1746       switch (code)
1747         {
1748         case BIT_AND_EXPR:
1749           {
1750             int i;
1751             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1752             unsigned HOST_WIDEST_INT tmp = val;
1753
1754             /* Only constants masking full bytes are allowed.  */
1755             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1756               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1757                 return NULL_TREE;
1758
1759             n->n &= val;
1760           }
1761           break;
1762         case LSHIFT_EXPR:
1763         case RSHIFT_EXPR:
1764         case LROTATE_EXPR:
1765         case RROTATE_EXPR:
1766           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1767             return NULL_TREE;
1768           break;
1769         CASE_CONVERT:
1770           {
1771             int type_size;
1772
1773             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1774             if (type_size % BITS_PER_UNIT != 0)
1775               return NULL_TREE;
1776
1777             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1778               {
1779                 /* If STMT casts to a smaller type mask out the bits not
1780                    belonging to the target type.  */
1781                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1782               }
1783             n->size = type_size / BITS_PER_UNIT;
1784           }
1785           break;
1786         default:
1787           return NULL_TREE;
1788         };
1789       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1790     }
1791
1792   /* Handle binary rhs.  */
1793
1794   if (rhs_class == GIMPLE_BINARY_RHS)
1795     {
1796       struct symbolic_number n1, n2;
1797       tree source_expr2;
1798
1799       if (code != BIT_IOR_EXPR)
1800         return NULL_TREE;
1801
1802       if (TREE_CODE (rhs2) != SSA_NAME)
1803         return NULL_TREE;
1804
1805       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1806
1807       switch (code)
1808         {
1809         case BIT_IOR_EXPR:
1810           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1811
1812           if (!source_expr1)
1813             return NULL_TREE;
1814
1815           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1816
1817           if (source_expr1 != source_expr2
1818               || n1.size != n2.size)
1819             return NULL_TREE;
1820
1821           n->size = n1.size;
1822           n->n = n1.n | n2.n;
1823
1824           if (!verify_symbolic_number_p (n, stmt))
1825             return NULL_TREE;
1826
1827           break;
1828         default:
1829           return NULL_TREE;
1830         }
1831       return source_expr1;
1832     }
1833   return NULL_TREE;
1834 }
1835
1836 /* Check if STMT completes a bswap implementation consisting of ORs,
1837    SHIFTs and ANDs.  Return the source tree expression on which the
1838    byte swap is performed and NULL if no bswap was found.  */
1839
1840 static tree
1841 find_bswap (gimple stmt)
1842 {
1843 /* The number which the find_bswap result should match in order to
1844    have a full byte swap.  The number is shifted to the left according
1845    to the size of the symbolic number before using it.  */
1846   unsigned HOST_WIDEST_INT cmp =
1847     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1848     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1849
1850   struct symbolic_number n;
1851   tree source_expr;
1852   int limit;
1853
1854   /* The last parameter determines the depth search limit.  It usually
1855      correlates directly to the number of bytes to be touched.  We
1856      increase that number by three  here in order to also
1857      cover signed -> unsigned converions of the src operand as can be seen
1858      in libgcc, and for initial shift/and operation of the src operand.  */
1859   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1860   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1861   source_expr =  find_bswap_1 (stmt, &n, limit);
1862
1863   if (!source_expr)
1864     return NULL_TREE;
1865
1866   /* Zero out the extra bits of N and CMP.  */
1867   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1868     {
1869       unsigned HOST_WIDEST_INT mask =
1870         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1871
1872       n.n &= mask;
1873       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1874     }
1875
1876   /* A complete byte swap should make the symbolic number to start
1877      with the largest digit in the highest order byte.  */
1878   if (cmp != n.n)
1879     return NULL_TREE;
1880
1881   return source_expr;
1882 }
1883
1884 /* Find manual byte swap implementations and turn them into a bswap
        builtin invocation.

        Every basic block is scanned backwards.  For each BIT_IOR_EXPR
        assignment whose result is 16, 32 or 64 bits wide, find_bswap is
        asked whether the statement completes a byte swap.  A match is
        replaced by a call to the bswap builtin of that width, with
        NOP_EXPR conversions added around the call where the source or the
        result type differs from the argument type of the builtin.

        Returns 0 if nothing was changed or the pass does not apply,
        otherwise the TODO flags requesting an SSA update and SSA/statement
        verification.  */
1886
1887 static unsigned int
1888 execute_optimize_bswap (void)
1889 {
1890   basic_block bb;
1891   bool bswap16_p, bswap32_p, bswap64_p;
1892   bool changed = false;
1893   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1894
       /* The matcher works on 8-bit bytes and keeps its byte markers in a
          HOST_WIDEST_INT, which has to be at least 8 bytes wide.  */
1895   if (BITS_PER_UNIT != 8)
1896     return 0;
1897
1898   if (sizeof (HOST_WIDEST_INT) < 8)
1899     return 0;
1900
       /* A width is usable when its builtin is available and the target
          has a bswap pattern for the mode.  The 64-bit variant is also
          accepted when the 32-bit one is usable and word_mode is SImode.  */
1901   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1902                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1903   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1904                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1905   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1906                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1907                    || (bswap32_p && word_mode == SImode)));
1908
1909   if (!bswap16_p && !bswap32_p && !bswap64_p)
1910     return 0;
1911
1912   /* Determine the argument type of the builtins.  The code later on
1913      assumes that the return and argument type are the same.  */
1914   if (bswap16_p)
1915     {
1916       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1917       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1918     }
1919
1920   if (bswap32_p)
1921     {
1922       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1923       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1924     }
1925
1926   if (bswap64_p)
1927     {
1928       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1929       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1930     }
1931
1932   memset (&bswap_stats, 0, sizeof (bswap_stats));
1933
1934   FOR_EACH_BB (bb)
1935     {
1936       gimple_stmt_iterator gsi;
1937
1938       /* We do a reverse scan for bswap patterns to make sure we get the
1939          widest match. As bswap pattern matching doesn't handle
1940          previously inserted smaller bswap replacements as sub-
1941          patterns, the wider variant wouldn't be detected.  */
1942       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1943         {
1944           gimple stmt = gsi_stmt (gsi);
1945           tree bswap_src, bswap_type;
1946           tree bswap_tmp;
1947           tree fndecl = NULL_TREE;
1948           int type_size;
1949           gimple call;
1950
               /* Only an OR can be the statement that completes a
                  hand-written byte swap.  */
1951           if (!is_gimple_assign (stmt)
1952               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1953             continue;
1954
1955           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1956
               /* Pick the builtin matching the width of the result.  FNDECL
                  stays NULL_TREE when that builtin is not usable.  */
1957           switch (type_size)
1958             {
1959             case 16:
1960               if (bswap16_p)
1961                 {
1962                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1963                   bswap_type = bswap16_type;
1964                 }
1965               break;
1966             case 32:
1967               if (bswap32_p)
1968                 {
1969                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1970                   bswap_type = bswap32_type;
1971                 }
1972               break;
1973             case 64:
1974               if (bswap64_p)
1975                 {
1976                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1977                   bswap_type = bswap64_type;
1978                 }
1979               break;
1980             default:
1981               continue;
1982             }
1983
1984           if (!fndecl)
1985             continue;
1986
1987           bswap_src = find_bswap (stmt);
1988
1989           if (!bswap_src)
1990             continue;
1991
1992           changed = true;
1993           if (type_size == 16)
1994             bswap_stats.found_16bit++;
1995           else if (type_size == 32)
1996             bswap_stats.found_32bit++;
1997           else
1998             bswap_stats.found_64bit++;
1999
2000           bswap_tmp = bswap_src;
2001
2002           /* Convert the src expression if necessary.  */
2003           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2004             {
2005               gimple convert_stmt;
2006               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2007               convert_stmt = gimple_build_assign_with_ops
2008                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2009               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2010             }
2011
2012           call = gimple_build_call (fndecl, 1, bswap_tmp);
2013
               /* From here on BSWAP_TMP names the result of the call: the
                  lhs of STMT itself, or a temporary that is converted
                  back into that lhs.  */
2014           bswap_tmp = gimple_assign_lhs (stmt);
2015
2016           /* Convert the result if necessary.  */
2017           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2018             {
2019               gimple convert_stmt;
2020               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2021               convert_stmt = gimple_build_assign_with_ops
2022                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2023               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2024             }
2025
2026           gimple_call_set_lhs (call, bswap_tmp);
2027
2028           if (dump_file)
2029             {
2030               fprintf (dump_file, "%d bit bswap implementation found at: ",
2031                        (int)type_size);
2032               print_gimple_stmt (dump_file, stmt, 0, 0);
2033             }
2034
               /* Put the call directly after the matched statement (and thus
                  before a result conversion inserted above), then remove
                  the matched statement.  */
2035           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2036           gsi_remove (&gsi, true);
2037         }
2038     }
2039
2040   statistics_counter_event (cfun, "16-bit bswap implementations found",
2041                             bswap_stats.found_16bit);
2042   statistics_counter_event (cfun, "32-bit bswap implementations found",
2043                             bswap_stats.found_32bit);
2044   statistics_counter_event (cfun, "64-bit bswap implementations found",
2045                             bswap_stats.found_64bit);
2046
2047   return (changed ? TODO_update_ssa | TODO_verify_ssa
2048           | TODO_verify_stmts : 0);
2049 }
2050
2051 static bool
2052 gate_optimize_bswap (void)
2053 {
2054   return flag_expensive_optimizations && optimize;
2055 }
2056
2057 namespace {
2058
     /* Static description of the bswap pass: a GIMPLE pass named "bswap"
        that has both a gate and an execute function and requires the
        function to be in SSA form.  */
2059 const pass_data pass_data_optimize_bswap =
2060 {
2061   GIMPLE_PASS, /* type */
2062   "bswap", /* name */
2063   OPTGROUP_NONE, /* optinfo_flags */
2064   true, /* has_gate */
2065   true, /* has_execute */
2066   TV_NONE, /* tv_id */
2067   PROP_ssa, /* properties_required */
2068   0, /* properties_provided */
2069   0, /* properties_destroyed */
2070   0, /* todo_flags_start */
2071   0, /* todo_flags_finish */
2072 };
2073
     /* Pass object for the bswap pass.  Its gate and execute methods
        simply forward to gate_optimize_bswap and execute_optimize_bswap.  */
2074 class pass_optimize_bswap : public gimple_opt_pass
2075 {
2076 public:
2077   pass_optimize_bswap (gcc::context *ctxt)
2078     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2079   {}
2080
2081   /* opt_pass methods: */
2082   bool gate () { return gate_optimize_bswap (); }
2083   unsigned int execute () { return execute_optimize_bswap (); }
2084
2085 }; // class pass_optimize_bswap
2086
2087 } // anon namespace
2088
2089 gimple_opt_pass *
2090 make_pass_optimize_bswap (gcc::context *ctxt)
2091 {
2092   return new pass_optimize_bswap (ctxt);
2093 }
2094
2095 /* Return true if stmt is a type conversion operation that can be stripped
2096    when used in a widening multiply operation.  */
2097 static bool
2098 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2099 {
2100   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2101
2102   if (TREE_CODE (result_type) == INTEGER_TYPE)
2103     {
2104       tree op_type;
2105       tree inner_op_type;
2106
2107       if (!CONVERT_EXPR_CODE_P (rhs_code))
2108         return false;
2109
2110       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2111
2112       /* If the type of OP has the same precision as the result, then
2113          we can strip this conversion.  The multiply operation will be
2114          selected to create the correct extension as a by-product.  */
2115       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2116         return true;
2117
2118       /* We can also strip a conversion if it preserves the signed-ness of
2119          the operation and doesn't narrow the range.  */
2120       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2121
2122       /* If the inner-most type is unsigned, then we can strip any
2123          intermediate widening operation.  If it's signed, then the
2124          intermediate widening operation must also be signed.  */
2125       if ((TYPE_UNSIGNED (inner_op_type)
2126            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2127           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2128         return true;
2129
2130       return false;
2131     }
2132
2133   return rhs_code == FIXED_CONVERT_EXPR;
2134 }
2135
2136 /* Return true if RHS is a suitable operand for a widening multiplication,
2137    assuming a target type of TYPE.
2138    There are two cases:
2139
2140      - RHS makes some value at least twice as wide.  Store that value
2141        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2142
2143      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2144        but leave *TYPE_OUT untouched.  */
2145
2146 static bool
2147 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2148                         tree *new_rhs_out)
2149 {
2150   gimple stmt;
2151   tree type1, rhs1;
2152
2153   if (TREE_CODE (rhs) == SSA_NAME)
2154     {
2155       stmt = SSA_NAME_DEF_STMT (rhs);
2156       if (is_gimple_assign (stmt))
2157         {
2158           if (! widening_mult_conversion_strippable_p (type, stmt))
2159             rhs1 = rhs;
2160           else
2161             {
2162               rhs1 = gimple_assign_rhs1 (stmt);
2163
2164               if (TREE_CODE (rhs1) == INTEGER_CST)
2165                 {
2166                   *new_rhs_out = rhs1;
2167                   *type_out = NULL;
2168                   return true;
2169                 }
2170             }
2171         }
2172       else
2173         rhs1 = rhs;
2174
2175       type1 = TREE_TYPE (rhs1);
2176
2177       if (TREE_CODE (type1) != TREE_CODE (type)
2178           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2179         return false;
2180
2181       *new_rhs_out = rhs1;
2182       *type_out = type1;
2183       return true;
2184     }
2185
2186   if (TREE_CODE (rhs) == INTEGER_CST)
2187     {
2188       *new_rhs_out = rhs;
2189       *type_out = NULL;
2190       return true;
2191     }
2192
2193   return false;
2194 }
2195
2196 /* Return true if STMT performs a widening multiplication, assuming the
2197    output type is TYPE.  If so, store the unwidened types of the operands
2198    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2199    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2200    and *TYPE2_OUT would give the operands of the multiplication.  */
2201
2202 static bool
2203 is_widening_mult_p (gimple stmt,
2204                     tree *type1_out, tree *rhs1_out,
2205                     tree *type2_out, tree *rhs2_out)
2206 {
2207   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2208
2209   if (TREE_CODE (type) != INTEGER_TYPE
2210       && TREE_CODE (type) != FIXED_POINT_TYPE)
2211     return false;
2212
2213   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2214                                rhs1_out))
2215     return false;
2216
2217   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2218                                rhs2_out))
2219     return false;
2220
2221   if (*type1_out == NULL)
2222     {
2223       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2224         return false;
2225       *type1_out = *type2_out;
2226     }
2227
2228   if (*type2_out == NULL)
2229     {
2230       if (!int_fits_type_p (*rhs2_out, *type1_out))
2231         return false;
2232       *type2_out = *type1_out;
2233     }
2234
2235   /* Ensure that the larger of the two operands comes first. */
2236   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2237     {
2238       tree tmp;
2239       tmp = *type1_out;
2240       *type1_out = *type2_out;
2241       *type2_out = tmp;
2242       tmp = *rhs1_out;
2243       *rhs1_out = *rhs2_out;
2244       *rhs2_out = tmp;
2245     }
2246
2247   return true;
2248 }
2249
2250 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2251    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2252    value is true iff we converted the statement.  */
2253
2254 static bool
2255 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2256 {
2257   tree lhs, rhs1, rhs2, type, type1, type2;
2258   enum insn_code handler;
2259   enum machine_mode to_mode, from_mode, actual_mode;
2260   optab op;
2261   int actual_precision;
2262   location_t loc = gimple_location (stmt);
2263   bool from_unsigned1, from_unsigned2;
2264
2265   lhs = gimple_assign_lhs (stmt);
2266   type = TREE_TYPE (lhs);
2267   if (TREE_CODE (type) != INTEGER_TYPE)
2268     return false;
2269
2270   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2271     return false;
2272
2273   to_mode = TYPE_MODE (type);
2274   from_mode = TYPE_MODE (type1);
2275   from_unsigned1 = TYPE_UNSIGNED (type1);
2276   from_unsigned2 = TYPE_UNSIGNED (type2);
2277
2278   if (from_unsigned1 && from_unsigned2)
2279     op = umul_widen_optab;
2280   else if (!from_unsigned1 && !from_unsigned2)
2281     op = smul_widen_optab;
2282   else
2283     op = usmul_widen_optab;
2284
2285   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2286                                                   0, &actual_mode);
2287
2288   if (handler == CODE_FOR_nothing)
2289     {
2290       if (op != smul_widen_optab)
2291         {
2292           /* We can use a signed multiply with unsigned types as long as
2293              there is a wider mode to use, or it is the smaller of the two
2294              types that is unsigned.  Note that type1 >= type2, always.  */
2295           if ((TYPE_UNSIGNED (type1)
2296                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2297               || (TYPE_UNSIGNED (type2)
2298                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2299             {
2300               from_mode = GET_MODE_WIDER_MODE (from_mode);
2301               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2302                 return false;
2303             }
2304
2305           op = smul_widen_optab;
2306           handler = find_widening_optab_handler_and_mode (op, to_mode,
2307                                                           from_mode, 0,
2308                                                           &actual_mode);
2309
2310           if (handler == CODE_FOR_nothing)
2311             return false;
2312
2313           from_unsigned1 = from_unsigned2 = false;
2314         }
2315       else
2316         return false;
2317     }
2318
2319   /* Ensure that the inputs to the handler are in the correct precison
2320      for the opcode.  This will be the full mode size.  */
2321   actual_precision = GET_MODE_PRECISION (actual_mode);
2322   if (2 * actual_precision > TYPE_PRECISION (type))
2323     return false;
2324   if (actual_precision != TYPE_PRECISION (type1)
2325       || from_unsigned1 != TYPE_UNSIGNED (type1))
2326     rhs1 = build_and_insert_cast (gsi, loc,
2327                                   build_nonstandard_integer_type
2328                                     (actual_precision, from_unsigned1), rhs1);
2329   if (actual_precision != TYPE_PRECISION (type2)
2330       || from_unsigned2 != TYPE_UNSIGNED (type2))
2331     rhs2 = build_and_insert_cast (gsi, loc,
2332                                   build_nonstandard_integer_type
2333                                     (actual_precision, from_unsigned2), rhs2);
2334
2335   /* Handle constants.  */
2336   if (TREE_CODE (rhs1) == INTEGER_CST)
2337     rhs1 = fold_convert (type1, rhs1);
2338   if (TREE_CODE (rhs2) == INTEGER_CST)
2339     rhs2 = fold_convert (type2, rhs2);
2340
2341   gimple_assign_set_rhs1 (stmt, rhs1);
2342   gimple_assign_set_rhs2 (stmt, rhs2);
2343   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2344   update_stmt (stmt);
2345   widen_mul_stats.widen_mults_inserted++;
2346   return true;
2347 }
2348
2349 /* Process a single gimple statement STMT, which is found at the
        iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
        rhs (given by CODE), and try to convert it into a
        WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
        is true iff we converted the statement.

        One operand of STMT must be a single-use widening multiplication
        (MULT_EXPR or WIDEN_MULT_EXPR), optionally seen through one
        conversion; for MINUS_EXPR it has to be the second operand.  The
        other operand becomes the accumulator.  Nothing is inserted or
        changed unless the target provides a handler for the resulting
        multiply-accumulate.  */
2354
2355 static bool
2356 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2357                             enum tree_code code)
2358 {
2359   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2360   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2361   tree type, type1, type2, optype;
2362   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2363   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2364   optab this_optab;
2365   enum tree_code wmult_code;
2366   enum insn_code handler;
2367   enum machine_mode to_mode, from_mode, actual_mode;
2368   location_t loc = gimple_location (stmt);
2369   int actual_precision;
2370   bool from_unsigned1, from_unsigned2;
2371
2372   lhs = gimple_assign_lhs (stmt);
2373   type = TREE_TYPE (lhs);
2374   if (TREE_CODE (type) != INTEGER_TYPE
2375       && TREE_CODE (type) != FIXED_POINT_TYPE)
2376     return false;
2377
2378   if (code == MINUS_EXPR)
2379     wmult_code = WIDEN_MULT_MINUS_EXPR;
2380   else
2381     wmult_code = WIDEN_MULT_PLUS_EXPR;
2382
2383   rhs1 = gimple_assign_rhs1 (stmt);
2384   rhs2 = gimple_assign_rhs2 (stmt);
2385
       /* Record the defining statement and rhs code of each operand.  The
          code stays ERROR_MARK for an operand that is not an SSA name
          defined by an assignment.  */
2386   if (TREE_CODE (rhs1) == SSA_NAME)
2387     {
2388       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2389       if (is_gimple_assign (rhs1_stmt))
2390         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2391     }
2392
2393   if (TREE_CODE (rhs2) == SSA_NAME)
2394     {
2395       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2396       if (is_gimple_assign (rhs2_stmt))
2397         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2398     }
2399
2400   /* Allow for one conversion statement between the multiply
2401      and addition/subtraction statement.  If there is more than
2402      one conversion then we assume they would invalidate this
2403      transformation.  If that's not the case then they should have
2404      been folded before now.  */
2405   if (CONVERT_EXPR_CODE_P (rhs1_code))
2406     {
2407       conv1_stmt = rhs1_stmt;
2408       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2409       if (TREE_CODE (rhs1) == SSA_NAME)
2410         {
2411           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2412           if (is_gimple_assign (rhs1_stmt))
2413             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2414         }
2415       else
2416         return false;
2417     }
2418   if (CONVERT_EXPR_CODE_P (rhs2_code))
2419     {
2420       conv2_stmt = rhs2_stmt;
2421       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2422       if (TREE_CODE (rhs2) == SSA_NAME)
2423         {
2424           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2425           if (is_gimple_assign (rhs2_stmt))
2426             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2427         }
2428       else
2429         return false;
2430     }
2431
2432   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2433      is_widening_mult_p, but we still need the rhs returns.
2434
2435      It might also appear that it would be sufficient to use the existing
2436      operands of the widening multiply, but that would limit the choice of
2437      multiply-and-accumulate instructions.
2438
2439      If the widened-multiplication result has more than one uses, it is
2440      probably wiser not to do the conversion.  */
       /* The multiplication is accepted as the first operand only for an
          addition; as the second operand it is accepted for both codes.
          ADD_RHS is the remaining operand and CONV_STMT the conversion,
          if any, that was stripped from the multiplication result.  */
2441   if (code == PLUS_EXPR
2442       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2443     {
2444       if (!has_single_use (rhs1)
2445           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2446                                   &type2, &mult_rhs2))
2447         return false;
2448       add_rhs = rhs2;
2449       conv_stmt = conv1_stmt;
2450     }
2451   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2452     {
2453       if (!has_single_use (rhs2)
2454           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2455                                   &type2, &mult_rhs2))
2456         return false;
2457       add_rhs = rhs1;
2458       conv_stmt = conv2_stmt;
2459     }
2460   else
2461     return false;
2462
2463   to_mode = TYPE_MODE (type);
2464   from_mode = TYPE_MODE (type1);
2465   from_unsigned1 = TYPE_UNSIGNED (type1);
2466   from_unsigned2 = TYPE_UNSIGNED (type2);
2467   optype = type1;
2468
2469   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2470   if (from_unsigned1 != from_unsigned2)
2471     {
2472       if (!INTEGRAL_TYPE_P (type))
2473         return false;
2474       /* We can use a signed multiply with unsigned types as long as
2475          there is a wider mode to use, or it is the smaller of the two
2476          types that is unsigned.  Note that type1 >= type2, always.  */
2477       if ((from_unsigned1
2478            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2479           || (from_unsigned2
2480               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2481         {
2482           from_mode = GET_MODE_WIDER_MODE (from_mode);
2483           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2484             return false;
2485         }
2486
2487       from_unsigned1 = from_unsigned2 = false;
2488       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2489                                                false);
2490     }
2491
2492   /* If there was a conversion between the multiply and addition
2493      then we need to make sure it fits a multiply-and-accumulate.
2494      There should be a single mode change which does not change the
2495      value.  */
2496   if (conv_stmt)
2497     {
2498       /* We use the original, unmodified data types for this.  */
2499       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2500       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
           /* Bits needed to hold any product of the two unwidened operand
              types, and whether both of those types are unsigned.  */
2501       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2502       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2503
2504       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2505         {
2506           /* Conversion is a truncate.  */
2507           if (TYPE_PRECISION (to_type) < data_size)
2508             return false;
2509         }
2510       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2511         {
2512           /* Conversion is an extend.  Check it's the right sort.  */
2513           if (TYPE_UNSIGNED (from_type) != is_unsigned
2514               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2515             return false;
2516         }
2517       /* else convert is a no-op for our purposes.  */
2518     }
2519
2520   /* Verify that the machine can perform a widening multiply
2521      accumulate in this mode/signedness combination, otherwise
2522      this transformation is likely to pessimize code.  */
2523   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2524   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2525                                                   from_mode, 0, &actual_mode);
2526
2527   if (handler == CODE_FOR_nothing)
2528     return false;
2529
2530   /* Ensure that the inputs to the handler are in the correct precision
2531      for the opcode.  This will be the full mode size.  */
2532   actual_precision = GET_MODE_PRECISION (actual_mode);
2533   if (actual_precision != TYPE_PRECISION (type1)
2534       || from_unsigned1 != TYPE_UNSIGNED (type1))
2535     mult_rhs1 = build_and_insert_cast (gsi, loc,
2536                                        build_nonstandard_integer_type
2537                                          (actual_precision, from_unsigned1),
2538                                        mult_rhs1);
2539   if (actual_precision != TYPE_PRECISION (type2)
2540       || from_unsigned2 != TYPE_UNSIGNED (type2))
2541     mult_rhs2 = build_and_insert_cast (gsi, loc,
2542                                        build_nonstandard_integer_type
2543                                          (actual_precision, from_unsigned2),
2544                                        mult_rhs2);
2545
       /* The accumulator operand must have the type of the result.  */
2546   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2547     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2548
2549   /* Handle constants.  */
2550   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2551     mult_rhs1 = fold_convert (type1, mult_rhs1);
2552   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2553     mult_rhs2 = fold_convert (type2, mult_rhs2);
2554
2555   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2556                                     add_rhs);
2557   update_stmt (gsi_stmt (*gsi));
2558   widen_mul_stats.maccs_inserted++;
2559   return true;
2560 }
2561
2562 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
2563    with uses in additions and subtractions to form fused multiply-add
2564    operations.  Returns true if successful and MUL_STMT should be removed.

        MUL_STMT need not be a MULT_EXPR assignment: the caller in this file
        also passes a pow (x, 2.0) call with OP1 == OP2 == x.

        The function works in two passes over the immediate uses of the
        product.  The first pass only inspects the uses and returns false
        as soon as one of them cannot be turned into an FMA, so on a false
        return no statement has been modified.  The second pass rewrites
        every non-debug use into an FMA_EXPR.  MUL_STMT itself is never
        removed here; that is left to the caller.  */
2565
2566 static bool
2567 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2568 {
2569   tree mul_result = gimple_get_lhs (mul_stmt);
2570   tree type = TREE_TYPE (mul_result);
2571   gimple use_stmt, neguse_stmt, fma_stmt;
2572   use_operand_p use_p;
2573   imm_use_iterator imm_iter;
2574
       /* Fusing a floating-point multiply with an add is a contraction, so
          do nothing when FP contraction has been turned off.  */
2575   if (FLOAT_TYPE_P (type)
2576       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2577     return false;
2578
2579   /* We don't want to do bitfield reduction ops.  */
2580   if (INTEGRAL_TYPE_P (type)
2581       && (TYPE_PRECISION (type)
2582           != GET_MODE_PRECISION (TYPE_MODE (type))))
2583     return false;
2584
2585   /* If the target doesn't support it, don't generate it.  We assume that
2586      if fma isn't available then fms, fnma or fnms are not either.  */
2587   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2588     return false;
2589
2590   /* If the multiplication has zero uses, it is kept around probably because
2591      of -fnon-call-exceptions.  Don't optimize it away in that case,
2592      it is DCE job.  */
2593   if (has_zero_uses (mul_result))
2594     return false;
2595
2596   /* Make sure that the multiplication statement becomes dead after
2597      the transformation, thus that all uses are transformed to FMAs.
2598      This means we assume that an FMA operation has the same cost
2599      as an addition.
          This first loop is a pure validation pass: it only reads the
          use list and bails out on the first use that does not fit.  */
2600   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2601     {
           /* RESULT is the SSA name that actually feeds the addition or
              subtraction: the product itself, or the result of negating
              it.  NEGATE_P records whether the product ends up negated;
              in this loop it only influences the fnma heuristic below.  */
2602       enum tree_code use_code;
2603       tree result = mul_result;
2604       bool negate_p = false;
2605
2606       use_stmt = USE_STMT (use_p);
2607
           /* Debug statements do not count as real uses.  */
2608       if (is_gimple_debug (use_stmt))
2609         continue;
2610
2611       /* For now restrict this operation to single basic blocks.  In theory
2612          we would want to support sinking the multiplication in
2613          m = a*b;
2614          if ()
2615            ma = m + c;
2616          else
2617            d = m;
2618          to form a fma in the then block and sink the multiplication to the
2619          else block.  */
2620       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2621         return false;
2622
2623       if (!is_gimple_assign (use_stmt))
2624         return false;
2625
2626       use_code = gimple_assign_rhs_code (use_stmt);
2627
2628       /* A negate on the multiplication leads to FNMA.  */
2629       if (use_code == NEGATE_EXPR)
2630         {
2631           ssa_op_iter iter;
2632           use_operand_p usep;
2633
               /* From here on the value being consumed is the negated
                  product rather than MUL_RESULT.  */
2634           result = gimple_assign_lhs (use_stmt);
2635
2636           /* Make sure the negate statement becomes dead with this
2637              single transformation.
                  NOTE(review): this call overwrites USE_P, the variable the
                  enclosing FOR_EACH_IMM_USE_FAST iterates with; presumably
                  harmless because the iteration state lives in IMM_ITER --
                  confirm against the macro definition.  */
2638           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2639                                &use_p, &neguse_stmt))
2640             return false;
2641
2642           /* Make sure the multiplication isn't also used on that stmt.  */
2643           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2644             if (USE_FROM_PTR (usep) == mul_result)
2645               return false;
2646
2647           /* Re-validate.  The consumer of the negate must satisfy the
                  same same-block and is-assignment requirements that were
                  checked for the negate itself above.  */
2648           use_stmt = neguse_stmt;
2649           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2650             return false;
2651           if (!is_gimple_assign (use_stmt))
2652             return false;
2653
2654           use_code = gimple_assign_rhs_code (use_stmt);
2655           negate_p = true;
2656         }
2657
2658       switch (use_code)
2659         {
2660         case MINUS_EXPR:
               /* In c - RESULT the product is subtracted, which flips the
                  sign it contributes with.  */
2661           if (gimple_assign_rhs2 (use_stmt) == result)
2662             negate_p = !negate_p;
2663           break;
2664         case PLUS_EXPR:
2665           break;
2666         default:
2667           /* FMA can only be formed from PLUS and MINUS.  */
2668           return false;
2669         }
2670
2671       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2672          by a MULT_EXPR that we'll visit later, we might be able to
2673          get a more profitable match with fnma.
2674          OTOH, if we don't, a negate / fma pair has likely lower latency
2675          than a mult / subtract pair.
              The check only applies to RESULT - x with a non-negated product
              on a target that lacks fms but provides fnma.  */
2676       if (use_code == MINUS_EXPR && !negate_p
2677           && gimple_assign_rhs1 (use_stmt) == result
2678           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2679           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2680         {
2681           tree rhs2 = gimple_assign_rhs2 (use_stmt);
2682
2683           if (TREE_CODE (rhs2) == SSA_NAME)
2684             {
2685               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
                   /* Give up on this product so that the other, single-use
                      multiplication can be fused instead.  */
2686               if (has_single_use (rhs2)
2687                   && is_gimple_assign (stmt2)
2688                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2689               return false;
2690             }
2691         }
2692
2693       /* We can't handle a * b + a * b.  */
2694       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2695         return false;
2696
2697       /* While it is possible to validate whether or not the exact form
2698          that we've recognized is available in the backend, the assumption
2699          is that the transformation is never a loss.  For instance, suppose
2700          the target only has the plain FMA pattern available.  Consider
2701          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2702          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2703          still have 3 operations, but in the FMA form the two NEGs are
2704          independent and could be run in parallel.  */
2705     }
2706
       /* Transformation pass.  Every non-debug use was validated above, so
          the checks are not repeated here.  Statements are replaced while
          walking the use list, hence FOR_EACH_IMM_USE_STMT rather than the
          read-only FOR_EACH_IMM_USE_FAST form.  */
2707   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2708     {
2709       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2710       enum tree_code use_code;
           /* MULOP1 starts out as OP1 for every use, so a negation emitted
              for one use does not leak into the next one.  */
2711       tree addop, mulop1 = op1, result = mul_result;
2712       bool negate_p = false;
2713
2714       if (is_gimple_debug (use_stmt))
2715         continue;
2716
2717       use_code = gimple_assign_rhs_code (use_stmt);
2718       if (use_code == NEGATE_EXPR)
2719         {
               /* Delete the negate and continue with its single consumer,
                  which is the statement that becomes the FMA.  The return
                  value of single_imm_use is not checked because the first
                  loop already required it to succeed.  */
2720           result = gimple_assign_lhs (use_stmt);
2721           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2722           gsi_remove (&gsi, true);
2723           release_defs (use_stmt);
2724
2725           use_stmt = neguse_stmt;
2726           gsi = gsi_for_stmt (use_stmt);
2727           use_code = gimple_assign_rhs_code (use_stmt);
2728           negate_p = true;
2729         }
2730
           /* Work out the addend.  The first loop rejected statements whose
              two operands are identical, so exactly one operand is RESULT
              and the other one is the addend.  */
2731       if (gimple_assign_rhs1 (use_stmt) == result)
2732         {
2733           addop = gimple_assign_rhs2 (use_stmt);
2734           /* a * b - c -> a * b + (-c)  */
2735           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2736             addop = force_gimple_operand_gsi (&gsi,
2737                                               build1 (NEGATE_EXPR,
2738                                                       type, addop),
2739                                               true, NULL_TREE, true,
2740                                               GSI_SAME_STMT);
2741         }
2742       else
2743         {
2744           addop = gimple_assign_rhs1 (use_stmt);
2745           /* a - b * c -> (-b) * c + a */
2746           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2747             negate_p = !negate_p;
2748         }
2749
           /* A negated product is expressed by negating the first
              multiplication operand; the negate is emitted through GSI.  */
2750       if (negate_p)
2751         mulop1 = force_gimple_operand_gsi (&gsi,
2752                                            build1 (NEGATE_EXPR,
2753                                                    type, mulop1),
2754                                            true, NULL_TREE, true,
2755                                            GSI_SAME_STMT);
2756
           /* The FMA defines the same left-hand side as the addition or
              subtraction it replaces, so users of that value are
              unaffected.  */
2757       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2758                                                gimple_assign_lhs (use_stmt),
2759                                                mulop1, op2,
2760                                                addop);
2761       gsi_replace (&gsi, fma_stmt, true);
2762       widen_mul_stats.fmas_inserted++;
2763     }
2764
2765   return true;
2766 }
2767
2768 /* Find integer multiplications where the operands are extended from
2769    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2770    where appropriate.

        The same walk over the function also hands additions and
        subtractions to convert_plusminus_to_widen, and offers
        multiplications that were not widened, as well as pow (x, 2.0)
        calls, to convert_mult_to_fma.

        Returns TODO_cleanup_cfg if removing a pow call purged dead EH
        edges, and 0 otherwise.  */
2771
2772 static unsigned int
2773 execute_optimize_widening_mul (void)
2774 {
2775   basic_block bb;
2776   bool cfg_changed = false;
2777
       /* The counters are reported through statistics_counter_event at the
          end, so start every invocation from zero.  */
2778   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2779
2780   FOR_EACH_BB (bb)
2781     {
2782       gimple_stmt_iterator gsi;
2783
           /* The loop header has no increment: GSI is advanced by the
              gsi_next at the bottom of the body, and the paths that delete
              the current statement `continue' past it, relying on
              gsi_remove to leave GSI at the statement that followed.  */
2784       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2785         {
2786           gimple stmt = gsi_stmt (gsi);
2787           enum tree_code code;
2788
2789           if (is_gimple_assign (stmt))
2790             {
2791               code = gimple_assign_rhs_code (stmt);
2792               switch (code)
2793                 {
2794                 case MULT_EXPR:
                       /* Prefer a widening multiply; FMA formation is only
                          attempted when that conversion did not happen.  A
                          true result from convert_mult_to_fma means every
                          use was rewritten, so the multiplication is dead
                          and is removed here.  */
2795                   if (!convert_mult_to_widen (stmt, &gsi)
2796                       && convert_mult_to_fma (stmt,
2797                                               gimple_assign_rhs1 (stmt),
2798                                               gimple_assign_rhs2 (stmt)))
2799                     {
2800                       gsi_remove (&gsi, true);
2801                       release_defs (stmt);
2802                       continue;
2803                     }
2804                   break;
2805
2806                 case PLUS_EXPR:
2807                 case MINUS_EXPR:
                       /* The result is deliberately ignored; the statement
                          is kept in either case.  */
2808                   convert_plusminus_to_widen (&gsi, stmt, code);
2809                   break;
2810
2811                 default:;
2812                 }
2813             }
               /* Only calls whose value is used are considered.  */
2814           else if (is_gimple_call (stmt)
2815                    && gimple_call_lhs (stmt))
2816             {
2817               tree fndecl = gimple_call_fndecl (stmt);
2818               if (fndecl
2819                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2820                 {
2821                   switch (DECL_FUNCTION_CODE (fndecl))
2822                     {
2823                       case BUILT_IN_POWF:
2824                       case BUILT_IN_POW:
2825                       case BUILT_IN_POWL:
                             /* pow (x, 2.0) is x * x, so it can feed an FMA
                                with both multiplication operands equal to
                                the first call argument.  */
2826                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2827                             && REAL_VALUES_EQUAL
2828                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2829                                   dconst2)
2830                             && convert_mult_to_fma (stmt,
2831                                                     gimple_call_arg (stmt, 0),
2832                                                     gimple_call_arg (stmt, 0)))
2833                           {
                                 /* Unlike the MULT_EXPR case, deleting a call
                                    needs its virtual definition unlinked
                                    first, and may leave EH edges dead;
                                    purging those changes the CFG, which is
                                    reported through the return value.  */
2834                             unlink_stmt_vdef (stmt);
2835                             if (gsi_remove (&gsi, true)
2836                                 && gimple_purge_dead_eh_edges (bb))
2837                               cfg_changed = true;
2838                             release_defs (stmt);
2839                             continue;
2840                           }
2841                           break;
2842
2843                       default:;
2844                     }
2845                 }
2846             }
2847           gsi_next (&gsi);
2848         }
2849     }
2850
2851   statistics_counter_event (cfun, "widening multiplications inserted",
2852                             widen_mul_stats.widen_mults_inserted);
2853   statistics_counter_event (cfun, "widening maccs inserted",
2854                             widen_mul_stats.maccs_inserted);
2855   statistics_counter_event (cfun, "fused multiply-adds inserted",
2856                             widen_mul_stats.fmas_inserted);
2857
2858   return cfg_changed ? TODO_cleanup_cfg : 0;
2859 }
2860
     /* Gate for the widening_mul pass: it runs only when optimization is
        enabled and expensive optimizations have been requested.  */
2861 static bool
2862 gate_optimize_widening_mul (void)
2863 {
2864   return optimize != 0 && flag_expensive_optimizations != 0;
2865 }
2866
2867 namespace {
2868
     /* Static description of the pass: a GIMPLE pass named "widening_mul"
        with both a gate and an execute hook.  It requires SSA form, uses
        TV_NONE as its timing variable, and on completion requests SSA and
        statement verification plus an SSA update.  */
2869 const pass_data pass_data_optimize_widening_mul =
2870 {
2871   GIMPLE_PASS, /* type */
2872   "widening_mul", /* name */
2873   OPTGROUP_NONE, /* optinfo_flags */
2874   true, /* has_gate */
2875   true, /* has_execute */
2876   TV_NONE, /* tv_id */
2877   PROP_ssa, /* properties_required */
2878   0, /* properties_provided */
2879   0, /* properties_destroyed */
2880   0, /* todo_flags_start */
2881   ( TODO_verify_ssa | TODO_verify_stmts
2882     | TODO_update_ssa ), /* todo_flags_finish */
2883 };
2884
     /* The pass object itself.  It carries no state of its own: it is
        constructed from the descriptor above, and its gate and execute
        methods forward to gate_optimize_widening_mul and
        execute_optimize_widening_mul.  */
2885 class pass_optimize_widening_mul : public gimple_opt_pass
2886 {
2887 public:
2888   pass_optimize_widening_mul (gcc::context *ctxt)
2889     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2890   {}
2891
2892   /* opt_pass methods: */
2893   bool gate () { return gate_optimize_widening_mul (); }
2894   unsigned int execute () { return execute_optimize_widening_mul (); }
2895
2896 }; // class pass_optimize_widening_mul
2897
2898 } // anon namespace
2899
     /* Allocate and return a new pass_optimize_widening_mul object bound to
        CTXT.  This is the only entry point to the pass with external
        linkage; the pass class itself lives in an anonymous namespace.  */
2900 gimple_opt_pass *
2901 make_pass_optimize_widening_mul (gcc::context *ctxt)
2902 {
2903   return new pass_optimize_widening_mul (ctxt);
2904 }