gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
   45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
   63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "basic-block.h"
  94 #include "tree-ssa-alias.h"
  95 #include "internal-fn.h"
  96 #include "gimple-fold.h"
  97 #include "gimple-expr.h"
  98 #include "is-a.h"
  99 #include "gimple.h"
 100 #include "gimple-iterator.h"
 101 #include "gimplify-me.h"
 102 #include "stor-layout.h"
 103 #include "gimple-ssa.h"
 104 #include "tree-cfg.h"
 105 #include "tree-phinodes.h"
 106 #include "ssa-iterators.h"
 107 #include "stringpool.h"
 108 #include "tree-ssanames.h"
 109 #include "expr.h"
 110 #include "tree-dfa.h"
 111 #include "tree-ssa.h"
 112 #include "tree-pass.h"
 113 #include "alloc-pool.h"
 114 #include "target.h"
 115 #include "gimple-pretty-print.h"
 116
 117 /* FIXME: RTL headers have to be included here for optabs.  */
 118 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 119 #include "expr.h"               /* Because optabs.h wants sepops.  */
 120 #include "optabs.h"
 121
 122 /* This structure represents one basic block that either computes a
 123    division, or is a common dominator for basic block that compute a
 124    division.  */
 125 struct occurrence {
 126   /* The basic block represented by this structure.  */
 127   basic_block bb;
 128
 129   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 130      inserted in BB.  */
 131   tree recip_def;
 132
 133   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 134      was inserted in BB.  */
 135   gimple recip_def_stmt;
 136
 137   /* Pointer to a list of "struct occurrence"s for blocks dominated
 138      by BB.  */
 139   struct occurrence *children;
 140
 141   /* Pointer to the next "struct occurrence"s in the list of blocks
 142      sharing a common dominator.  */
 143   struct occurrence *next;
 144
 145   /* The number of divisions that are in BB before compute_merit.  The
 146      number of divisions that are in BB or post-dominate it after
 147      compute_merit.  */
 148   int num_divisions;
 149
 150   /* True if the basic block has a division, false if it is a common
 151      dominator for basic blocks that do.  If it is false and trapping
 152      math is active, BB is not a candidate for inserting a reciprocal.  */
 153   bool bb_has_division;
 154 };
 155
 156 static struct
 157 {
 158   /* Number of 1.0/X ops inserted.  */
 159   int rdivs_inserted;
 160
 161   /* Number of 1.0/FUNC ops inserted.  */
 162   int rfuncs_inserted;
 163 } reciprocal_stats;
 164
 165 static struct
 166 {
 167   /* Number of cexpi calls inserted.  */
 168   int inserted;
 169 } sincos_stats;
 170
 171 static struct
 172 {
 173   /* Number of hand-written 16-bit bswaps found.  */
 174   int found_16bit;
 175
 176   /* Number of hand-written 32-bit bswaps found.  */
 177   int found_32bit;
 178
 179   /* Number of hand-written 64-bit bswaps found.  */
 180   int found_64bit;
 181 } bswap_stats;
 182
 183 static struct
 184 {
 185   /* Number of widening multiplication ops inserted.  */
 186   int widen_mults_inserted;
 187
 188   /* Number of integer multiply-and-accumulate ops inserted.  */
 189   int maccs_inserted;
 190
 191   /* Number of fp fused multiply-add ops inserted.  */
 192   int fmas_inserted;
 193 } widen_mul_stats;
 194
 195 /* The instance of "struct occurrence" representing the highest
 196    interesting block in the dominator tree.  */
 197 static struct occurrence *occ_head;
 198
 199 /* Allocation pool for getting instances of "struct occurrence".  */
 200 static alloc_pool occ_pool;
 201
 202
 203
 204 /* Allocate and return a new struct occurrence for basic block BB, and
 205    whose children list is headed by CHILDREN.  */
 206 static struct occurrence *
 207 occ_new (basic_block bb, struct occurrence *children)
 208 {
 209   struct occurrence *occ;
 210
 211   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 212   memset (occ, 0, sizeof (struct occurrence));
 213
 214   occ->bb = bb;
 215   occ->children = children;
 216   return occ;
 217 }
 218
 219
 220 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 221    list of "struct occurrence"s, one per basic block, having IDOM as
 222    their common dominator.
 223
 224    We try to insert NEW_OCC as deep as possible in the tree, and we also
 225    insert any other block that is a common dominator for BB and one
 226    block already in the tree.  */
 227
 228 static void
 229 insert_bb (struct occurrence *new_occ, basic_block idom,
 230            struct occurrence **p_head)
 231 {
 232   struct occurrence *occ, **p_occ;
 233
 234   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 235     {
 236       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 237       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 238       if (dom == bb)
 239         {
 240           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 241              from its list.  */
 242           *p_occ = occ->next;
 243           occ->next = new_occ->children;
 244           new_occ->children = occ;
 245
 246           /* Try the next block (it may as well be dominated by BB).  */
 247         }
 248
 249       else if (dom == occ_bb)
 250         {
 251           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 252           insert_bb (new_occ, dom, &occ->children);
 253           return;
 254         }
 255
 256       else if (dom != idom)
 257         {
 258           gcc_assert (!dom->aux);
 259
 260           /* There is a dominator between IDOM and BB, add it and make
 261              two children out of NEW_OCC and OCC.  First, remove OCC from
 262              its list.  */
 263           *p_occ = occ->next;
 264           new_occ->next = occ;
 265           occ->next = NULL;
 266
 267           /* None of the previous blocks has DOM as a dominator: if we tail
 268              recursed, we would reexamine them uselessly. Just switch BB with
 269              DOM, and go on looking for blocks dominated by DOM.  */
 270           new_occ = occ_new (dom, new_occ);
 271         }
 272
 273       else
 274         {
 275           /* Nothing special, go on with the next element.  */
 276           p_occ = &occ->next;
 277         }
 278     }
 279
 280   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 281   new_occ->next = *p_head;
 282   *p_head = new_occ;
 283 }
 284
 285 /* Register that we found a division in BB.  */
 286
 287 static inline void
 288 register_division_in (basic_block bb)
 289 {
 290   struct occurrence *occ;
 291
 292   occ = (struct occurrence *) bb->aux;
 293   if (!occ)
 294     {
 295       occ = occ_new (bb, NULL);
 296       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 297     }
 298
 299   occ->bb_has_division = true;
 300   occ->num_divisions++;
 301 }
 302
 303
 304 /* Compute the number of divisions that postdominate each block in OCC and
 305    its children.  */
 306
 307 static void
 308 compute_merit (struct occurrence *occ)
 309 {
 310   struct occurrence *occ_child;
 311   basic_block dom = occ->bb;
 312
 313   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 314     {
 315       basic_block bb;
 316       if (occ_child->children)
 317         compute_merit (occ_child);
 318
 319       if (flag_exceptions)
 320         bb = single_noncomplex_succ (dom);
 321       else
 322         bb = dom;
 323
 324       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 325         occ->num_divisions += occ_child->num_divisions;
 326     }
 327 }
 328
 329
 330 /* Return whether USE_STMT is a floating-point division by DEF.  */
 331 static inline bool
 332 is_division_by (gimple use_stmt, tree def)
 333 {
 334   return is_gimple_assign (use_stmt)
 335          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 336          && gimple_assign_rhs2 (use_stmt) == def
 337          /* Do not recognize x / x as valid division, as we are getting
 338             confused later by replacing all immediate uses x in such
 339             a stmt.  */
 340          && gimple_assign_rhs1 (use_stmt) != def;
 341 }
 342
 343 /* Walk the subset of the dominator tree rooted at OCC, setting the
 344    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 345    the given basic block.  The field may be left NULL, of course,
 346    if it is not possible or profitable to do the optimization.
 347
 348    DEF_BSI is an iterator pointing at the statement defining DEF.
 349    If RECIP_DEF is set, a dominator already has a computation that can
 350    be used.  */
 351
 352 static void
 353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 354                     tree def, tree recip_def, int threshold)
 355 {
 356   tree type;
 357   gimple new_stmt;
 358   gimple_stmt_iterator gsi;
 359   struct occurrence *occ_child;
 360
 361   if (!recip_def
 362       && (occ->bb_has_division || !flag_trapping_math)
 363       && occ->num_divisions >= threshold)
 364     {
 365       /* Make a variable with the replacement and substitute it.  */
 366       type = TREE_TYPE (def);
 367       recip_def = create_tmp_reg (type, "reciptmp");
 368       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 369                                                build_one_cst (type), def);
 370
 371       if (occ->bb_has_division)
 372         {
 373           /* Case 1: insert before an existing division.  */
 374           gsi = gsi_after_labels (occ->bb);
 375           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 376             gsi_next (&gsi);
 377
 378           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 379         }
 380       else if (def_gsi && occ->bb == def_gsi->bb)
 381         {
 382           /* Case 2: insert right after the definition.  Note that this will
 383              never happen if the definition statement can throw, because in
 384              that case the sole successor of the statement's basic block will
 385              dominate all the uses as well.  */
 386           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 387         }
 388       else
 389         {
 390           /* Case 3: insert in a basic block not containing defs/uses.  */
 391           gsi = gsi_after_labels (occ->bb);
 392           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 393         }
 394
 395       reciprocal_stats.rdivs_inserted++;
 396
 397       occ->recip_def_stmt = new_stmt;
 398     }
 399
 400   occ->recip_def = recip_def;
 401   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 402     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 403 }
 404
 405
 406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 407    possible.  */
 408
 409 static inline void
 410 replace_reciprocal (use_operand_p use_p)
 411 {
 412   gimple use_stmt = USE_STMT (use_p);
 413   basic_block bb = gimple_bb (use_stmt);
 414   struct occurrence *occ = (struct occurrence *) bb->aux;
 415
 416   if (optimize_bb_for_speed_p (bb)
 417       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 418     {
 419       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 420       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 421       SET_USE (use_p, occ->recip_def);
 422       fold_stmt_inplace (&gsi);
 423       update_stmt (use_stmt);
 424     }
 425 }
 426
 427
 428 /* Free OCC and return one more "struct occurrence" to be freed.  */
 429
 430 static struct occurrence *
 431 free_bb (struct occurrence *occ)
 432 {
 433   struct occurrence *child, *next;
 434
 435   /* First get the two pointers hanging off OCC.  */
 436   next = occ->next;
 437   child = occ->children;
 438   occ->bb->aux = NULL;
 439   pool_free (occ_pool, occ);
 440
 441   /* Now ensure that we don't recurse unless it is necessary.  */
 442   if (!child)
 443     return next;
 444   else
 445     {
 446       while (next)
 447         next = free_bb (next);
 448
 449       return child;
 450     }
 451 }
 452
 453
 454 /* Look for floating-point divisions among DEF's uses, and try to
 455    replace them by multiplications with the reciprocal.  Add
 456    as many statements computing the reciprocal as needed.
 457
 458    DEF must be a GIMPLE register of a floating-point type.  */
 459
 460 static void
 461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 462 {
 463   use_operand_p use_p;
 464   imm_use_iterator use_iter;
 465   struct occurrence *occ;
 466   int count = 0, threshold;
 467
 468   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 469
 470   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 471     {
 472       gimple use_stmt = USE_STMT (use_p);
 473       if (is_division_by (use_stmt, def))
 474         {
 475           register_division_in (gimple_bb (use_stmt));
 476           count++;
 477         }
 478     }
 479
 480   /* Do the expensive part only if we can hope to optimize something.  */
 481   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 482   if (count >= threshold)
 483     {
 484       gimple use_stmt;
 485       for (occ = occ_head; occ; occ = occ->next)
 486         {
 487           compute_merit (occ);
 488           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 489         }
 490
 491       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 492         {
 493           if (is_division_by (use_stmt, def))
 494             {
 495               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 496                 replace_reciprocal (use_p);
 497             }
 498         }
 499     }
 500
 501   for (occ = occ_head; occ; )
 502     occ = free_bb (occ);
 503
 504   occ_head = NULL;
 505 }
 506
 507 static bool
 508 gate_cse_reciprocals (void)
 509 {
 510   return optimize && flag_reciprocal_math;
 511 }
 512
 513 /* Go through all the floating-point SSA_NAMEs, and call
 514    execute_cse_reciprocals_1 on each of them.  */
 515 static unsigned int
 516 execute_cse_reciprocals (void)
 517 {
 518   basic_block bb;
 519   tree arg;
 520
 521   occ_pool = create_alloc_pool ("dominators for recip",
 522                                 sizeof (struct occurrence),
 523                                 n_basic_blocks_for_fn (cfun) / 3 + 1);
 524
 525   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 526   calculate_dominance_info (CDI_DOMINATORS);
 527   calculate_dominance_info (CDI_POST_DOMINATORS);
 528
 529 #ifdef ENABLE_CHECKING
 530   FOR_EACH_BB_FN (bb, cfun)
 531     gcc_assert (!bb->aux);
 532 #endif
 533
 534   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 535     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 536         && is_gimple_reg (arg))
 537       {
 538         tree name = ssa_default_def (cfun, arg);
 539         if (name)
 540           execute_cse_reciprocals_1 (NULL, name);
 541       }
 542
 543   FOR_EACH_BB_FN (bb, cfun)
 544     {
 545       gimple_stmt_iterator gsi;
 546       gimple phi;
 547       tree def;
 548
 549       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 550         {
 551           phi = gsi_stmt (gsi);
 552           def = PHI_RESULT (phi);
 553           if (! virtual_operand_p (def)
 554               && FLOAT_TYPE_P (TREE_TYPE (def)))
 555             execute_cse_reciprocals_1 (NULL, def);
 556         }
 557
 558       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 559         {
 560           gimple stmt = gsi_stmt (gsi);
 561
 562           if (gimple_has_lhs (stmt)
 563               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 564               && FLOAT_TYPE_P (TREE_TYPE (def))
 565               && TREE_CODE (def) == SSA_NAME)
 566             execute_cse_reciprocals_1 (&gsi, def);
 567         }
 568
 569       if (optimize_bb_for_size_p (bb))
 570         continue;
 571
 572       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 573       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 574         {
 575           gimple stmt = gsi_stmt (gsi);
 576           tree fndecl;
 577
 578           if (is_gimple_assign (stmt)
 579               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 580             {
 581               tree arg1 = gimple_assign_rhs2 (stmt);
 582               gimple stmt1;
 583
 584               if (TREE_CODE (arg1) != SSA_NAME)
 585                 continue;
 586
 587               stmt1 = SSA_NAME_DEF_STMT (arg1);
 588
 589               if (is_gimple_call (stmt1)
 590                   && gimple_call_lhs (stmt1)
 591                   && (fndecl = gimple_call_fndecl (stmt1))
 592                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 593                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 594                 {
 595                   enum built_in_function code;
 596                   bool md_code, fail;
 597                   imm_use_iterator ui;
 598                   use_operand_p use_p;
 599
 600                   code = DECL_FUNCTION_CODE (fndecl);
 601                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 602
 603                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 604                   if (!fndecl)
 605                     continue;
 606
 607                   /* Check that all uses of the SSA name are divisions,
 608                      otherwise replacing the defining statement will do
 609                      the wrong thing.  */
 610                   fail = false;
 611                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 612                     {
 613                       gimple stmt2 = USE_STMT (use_p);
 614                       if (is_gimple_debug (stmt2))
 615                         continue;
 616                       if (!is_gimple_assign (stmt2)
 617                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 618                           || gimple_assign_rhs1 (stmt2) == arg1
 619                           || gimple_assign_rhs2 (stmt2) != arg1)
 620                         {
 621                           fail = true;
 622                           break;
 623                         }
 624                     }
 625                   if (fail)
 626                     continue;
 627
 628                   gimple_replace_ssa_lhs (stmt1, arg1);
 629                   gimple_call_set_fndecl (stmt1, fndecl);
 630                   update_stmt (stmt1);
 631                   reciprocal_stats.rfuncs_inserted++;
 632
 633                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 634                     {
 635                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 636                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 637                       fold_stmt_inplace (&gsi);
 638                       update_stmt (stmt);
 639                     }
 640                 }
 641             }
 642         }
 643     }
 644
 645   statistics_counter_event (cfun, "reciprocal divs inserted",
 646                             reciprocal_stats.rdivs_inserted);
 647   statistics_counter_event (cfun, "reciprocal functions inserted",
 648                             reciprocal_stats.rfuncs_inserted);
 649
 650   free_dominance_info (CDI_DOMINATORS);
 651   free_dominance_info (CDI_POST_DOMINATORS);
 652   free_alloc_pool (occ_pool);
 653   return 0;
 654 }
 655
 656 namespace {
 657
 658 const pass_data pass_data_cse_reciprocals =
 659 {
 660   GIMPLE_PASS, /* type */
 661   "recip", /* name */
 662   OPTGROUP_NONE, /* optinfo_flags */
 663   true, /* has_gate */
 664   true, /* has_execute */
 665   TV_NONE, /* tv_id */
 666   PROP_ssa, /* properties_required */
 667   0, /* properties_provided */
 668   0, /* properties_destroyed */
 669   0, /* todo_flags_start */
 670   ( TODO_update_ssa | TODO_verify_ssa
 671     | TODO_verify_stmts ), /* todo_flags_finish */
 672 };
 673
 674 class pass_cse_reciprocals : public gimple_opt_pass
 675 {
 676 public:
 677   pass_cse_reciprocals (gcc::context *ctxt)
 678     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 679   {}
 680
 681   /* opt_pass methods: */
 682   bool gate () { return gate_cse_reciprocals (); }
 683   unsigned int execute () { return execute_cse_reciprocals (); }
 684
 685 }; // class pass_cse_reciprocals
 686
 687 } // anon namespace
 688
 689 gimple_opt_pass *
 690 make_pass_cse_reciprocals (gcc::context *ctxt)
 691 {
 692   return new pass_cse_reciprocals (ctxt);
 693 }
 694
 695 /* Records an occurrence at statement USE_STMT in the vector of trees
 696    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 697    is not yet initialized.  Returns true if the occurrence was pushed on
 698    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 699    statements in the vector.  */
 700
 701 static bool
 702 maybe_record_sincos (vec<gimple> *stmts,
 703                      basic_block *top_bb, gimple use_stmt)
 704 {
 705   basic_block use_bb = gimple_bb (use_stmt);
 706   if (*top_bb
 707       && (*top_bb == use_bb
 708           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 709     stmts->safe_push (use_stmt);
 710   else if (!*top_bb
 711            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 712     {
 713       stmts->safe_push (use_stmt);
 714       *top_bb = use_bb;
 715     }
 716   else
 717     return false;
 718
 719   return true;
 720 }
 721
 722 /* Look for sin, cos and cexpi calls with the same argument NAME and
 723    create a single call to cexpi CSEing the result in this case.
 724    We first walk over all immediate uses of the argument collecting
 725    statements that we can CSE in a vector and in a second pass replace
 726    the statement rhs with a REALPART or IMAGPART expression on the
 727    result of the cexpi call we insert before the use statement that
 728    dominates all other candidates.  */
 729
 730 static bool
 731 execute_cse_sincos_1 (tree name)
 732 {
 733   gimple_stmt_iterator gsi;
 734   imm_use_iterator use_iter;
 735   tree fndecl, res, type;
 736   gimple def_stmt, use_stmt, stmt;
 737   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 738   vec<gimple> stmts = vNULL;
 739   basic_block top_bb = NULL;
 740   int i;
 741   bool cfg_changed = false;
 742
 743   type = TREE_TYPE (name);
 744   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 745     {
 746       if (gimple_code (use_stmt) != GIMPLE_CALL
 747           || !gimple_call_lhs (use_stmt)
 748           || !(fndecl = gimple_call_fndecl (use_stmt))
 749           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 750         continue;
 751
 752       switch (DECL_FUNCTION_CODE (fndecl))
 753         {
 754         CASE_FLT_FN (BUILT_IN_COS):
 755           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 756           break;
 757
 758         CASE_FLT_FN (BUILT_IN_SIN):
 759           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 760           break;
 761
 762         CASE_FLT_FN (BUILT_IN_CEXPI):
 763           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 764           break;
 765
 766         default:;
 767         }
 768     }
 769
 770   if (seen_cos + seen_sin + seen_cexpi <= 1)
 771     {
 772       stmts.release ();
 773       return false;
 774     }
 775
 776   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 777      the name def statement.  */
 778   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 779   if (!fndecl)
 780     return false;
 781   stmt = gimple_build_call (fndecl, 1, name);
 782   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 783   gimple_call_set_lhs (stmt, res);
 784
 785   def_stmt = SSA_NAME_DEF_STMT (name);
 786   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 787       && gimple_code (def_stmt) != GIMPLE_PHI
 788       && gimple_bb (def_stmt) == top_bb)
 789     {
 790       gsi = gsi_for_stmt (def_stmt);
 791       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 792     }
 793   else
 794     {
 795       gsi = gsi_after_labels (top_bb);
 796       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 797     }
 798   sincos_stats.inserted++;
 799
 800   /* And adjust the recorded old call sites.  */
 801   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 802     {
 803       tree rhs = NULL;
 804       fndecl = gimple_call_fndecl (use_stmt);
 805
 806       switch (DECL_FUNCTION_CODE (fndecl))
 807         {
 808         CASE_FLT_FN (BUILT_IN_COS):
 809           rhs = fold_build1 (REALPART_EXPR, type, res);
 810           break;
 811
 812         CASE_FLT_FN (BUILT_IN_SIN):
 813           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 814           break;
 815
 816         CASE_FLT_FN (BUILT_IN_CEXPI):
 817           rhs = res;
 818           break;
 819
 820         default:;
 821           gcc_unreachable ();
 822         }
 823
 824         /* Replace call with a copy.  */
 825         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 826
 827         gsi = gsi_for_stmt (use_stmt);
 828         gsi_replace (&gsi, stmt, true);
 829         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 830           cfg_changed = true;
 831     }
 832
 833   stmts.release ();
 834
 835   return cfg_changed;
 836 }
 837
 838 /* To evaluate powi(x,n), the floating point value x raised to the
 839    constant integer exponent n, we use a hybrid algorithm that
 840    combines the "window method" with look-up tables.  For an
 841    introduction to exponentiation algorithms and "addition chains",
 842    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 843    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 844    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 845    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 846
 847 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 848    multiplications to inline before calling the system library's pow
 849    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 850    so this default never requires calling pow, powf or powl.  */
 851
 852 #ifndef POWI_MAX_MULTS
 853 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 854 #endif
 855
 856 /* The size of the "optimal power tree" lookup table.  All
 857    exponents less than this value are simply looked up in the
 858    powi_table below.  This threshold is also used to size the
 859    cache of pseudo registers that hold intermediate results.  */
 860 #define POWI_TABLE_SIZE 256
 861
 862 /* The size, in bits of the window, used in the "window method"
 863    exponentiation algorithm.  This is equivalent to a radix of
 864    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 865 #define POWI_WINDOW_SIZE 3
 866
 867 /* The following table is an efficient representation of an
 868    "optimal power tree".  For each value, i, the corresponding
  869    value, j, in the table states that an optimal evaluation
 870    sequence for calculating pow(x,i) can be found by evaluating
 871    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 872    100 integers is given in Knuth's "Seminumerical algorithms".  */
 873
 874 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 875   {
 876       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 877       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 878       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 879      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 880      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 881      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 882      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 883      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 884      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 885      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 886      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 887      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 888      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 889      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 890      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 891      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 892      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 893      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 894      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 895      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 896      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 897      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 898      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 899      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 900      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 901     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 902     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 903     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 904     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 905     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 906     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 907     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 908   };
 909
 910
 911 /* Return the number of multiplications required to calculate
 912    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 913    subroutine of powi_cost.  CACHE is an array indicating
 914    which exponents have already been calculated.  */
 915
 916 static int
 917 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 918 {
 919   /* If we've already calculated this exponent, then this evaluation
 920      doesn't require any additional multiplications.  */
 921   if (cache[n])
 922     return 0;
 923
 924   cache[n] = true;
 925   return powi_lookup_cost (n - powi_table[n], cache)
 926          + powi_lookup_cost (powi_table[n], cache) + 1;
 927 }
 928
 929 /* Return the number of multiplications required to calculate
 930    powi(x,n) for an arbitrary x, given the exponent N.  This
 931    function needs to be kept in sync with powi_as_mults below.  */
 932
 933 static int
 934 powi_cost (HOST_WIDE_INT n)
 935 {
 936   bool cache[POWI_TABLE_SIZE];
 937   unsigned HOST_WIDE_INT digit;
 938   unsigned HOST_WIDE_INT val;
 939   int result;
 940
 941   if (n == 0)
 942     return 0;
 943
 944   /* Ignore the reciprocal when calculating the cost.  */
 945   val = (n < 0) ? -n : n;
 946
 947   /* Initialize the exponent cache.  */
 948   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 949   cache[1] = true;
 950
 951   result = 0;
 952
 953   while (val >= POWI_TABLE_SIZE)
 954     {
 955       if (val & 1)
 956         {
 957           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 958           result += powi_lookup_cost (digit, cache)
 959                     + POWI_WINDOW_SIZE + 1;
 960           val >>= POWI_WINDOW_SIZE;
 961         }
 962       else
 963         {
 964           val >>= 1;
 965           result++;
 966         }
 967     }
 968
 969   return result + powi_lookup_cost (val, cache);
 970 }
 971
 972 /* Recursive subroutine of powi_as_mults.  This function takes the
 973    array, CACHE, of already calculated exponents and an exponent N and
 974    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 975
 976 static tree
 977 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 978                  HOST_WIDE_INT n, tree *cache)
 979 {
 980   tree op0, op1, ssa_target;
 981   unsigned HOST_WIDE_INT digit;
 982   gimple mult_stmt;
 983
 984   if (n < POWI_TABLE_SIZE && cache[n])
 985     return cache[n];
 986
 987   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 988
 989   if (n < POWI_TABLE_SIZE)
 990     {
 991       cache[n] = ssa_target;
 992       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 993       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 994     }
 995   else if (n & 1)
 996     {
 997       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 998       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 999       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1000     }
1001   else
1002     {
1003       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1004       op1 = op0;
1005     }
1006
1007   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1008   gimple_set_location (mult_stmt, loc);
1009   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1010
1011   return ssa_target;
1012 }
1013
1014 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1015    This function needs to be kept in sync with powi_cost above.  */
1016
1017 static tree
1018 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1019                tree arg0, HOST_WIDE_INT n)
1020 {
1021   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1022   gimple div_stmt;
1023   tree target;
1024
1025   if (n == 0)
1026     return build_real (type, dconst1);
1027
1028   memset (cache, 0,  sizeof (cache));
1029   cache[1] = arg0;
1030
1031   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1032   if (n >= 0)
1033     return result;
1034
1035   /* If the original exponent was negative, reciprocate the result.  */
1036   target = make_temp_ssa_name (type, NULL, "powmult");
1037   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1038                                            build_real (type, dconst1),
1039                                            result);
1040   gimple_set_location (div_stmt, loc);
1041   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1042
1043   return target;
1044 }
1045
1046 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1047    location info LOC.  If the arguments are appropriate, create an
1048    equivalent sequence of statements prior to GSI using an optimal
1049    number of multiplications, and return an expession holding the
1050    result.  */
1051
1052 static tree
1053 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1054                             tree arg0, HOST_WIDE_INT n)
1055 {
1056   /* Avoid largest negative number.  */
1057   if (n != -n
1058       && ((n >= -1 && n <= 2)
1059           || (optimize_function_for_speed_p (cfun)
1060               && powi_cost (n) <= POWI_MAX_MULTS)))
1061     return powi_as_mults (gsi, loc, arg0, n);
1062
1063   return NULL_TREE;
1064 }
1065
1066 /* Build a gimple call statement that calls FN with argument ARG.
1067    Set the lhs of the call statement to a fresh SSA name.  Insert the
1068    statement prior to GSI's current position, and return the fresh
1069    SSA name.  */
1070
1071 static tree
1072 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1073                        tree fn, tree arg)
1074 {
1075   gimple call_stmt;
1076   tree ssa_target;
1077
1078   call_stmt = gimple_build_call (fn, 1, arg);
1079   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1080   gimple_set_lhs (call_stmt, ssa_target);
1081   gimple_set_location (call_stmt, loc);
1082   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1083
1084   return ssa_target;
1085 }
1086
1087 /* Build a gimple binary operation with the given CODE and arguments
1088    ARG0, ARG1, assigning the result to a new SSA name for variable
1089    TARGET.  Insert the statement prior to GSI's current position, and
1090    return the fresh SSA name.*/
1091
1092 static tree
1093 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1094                         const char *name, enum tree_code code,
1095                         tree arg0, tree arg1)
1096 {
1097   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1098   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1099   gimple_set_location (stmt, loc);
1100   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1101   return result;
1102 }
1103
1104 /* Build a gimple reference operation with the given CODE and argument
1105    ARG, assigning the result to a new SSA name of TYPE with NAME.
1106    Insert the statement prior to GSI's current position, and return
1107    the fresh SSA name.  */
1108
1109 static inline tree
1110 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1111                       const char *name, enum tree_code code, tree arg0)
1112 {
1113   tree result = make_temp_ssa_name (type, NULL, name);
1114   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1115   gimple_set_location (stmt, loc);
1116   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1117   return result;
1118 }
1119
1120 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1121    prior to GSI's current position, and return the fresh SSA name.  */
1122
1123 static tree
1124 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1125                        tree type, tree val)
1126 {
1127   tree result = make_ssa_name (type, NULL);
1128   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1129   gimple_set_location (stmt, loc);
1130   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1131   return result;
1132 }
1133
1134 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1135    with location info LOC.  If possible, create an equivalent and
1136    less expensive sequence of statements prior to GSI, and return an
1137    expession holding the result.  */
1138
1139 static tree
1140 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1141                            tree arg0, tree arg1)
1142 {
1143   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1144   REAL_VALUE_TYPE c2, dconst3;
1145   HOST_WIDE_INT n;
1146   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1147   enum machine_mode mode;
1148   bool hw_sqrt_exists, c_is_int, c2_is_int;
1149
1150   /* If the exponent isn't a constant, there's nothing of interest
1151      to be done.  */
1152   if (TREE_CODE (arg1) != REAL_CST)
1153     return NULL_TREE;
1154
1155   /* If the exponent is equivalent to an integer, expand to an optimal
1156      multiplication sequence when profitable.  */
1157   c = TREE_REAL_CST (arg1);
1158   n = real_to_integer (&c);
1159   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1160   c_is_int = real_identical (&c, &cint);
1161
1162   if (c_is_int
1163       && ((n >= -1 && n <= 2)
1164           || (flag_unsafe_math_optimizations
1165               && optimize_insn_for_speed_p ()
1166               && powi_cost (n) <= POWI_MAX_MULTS)))
1167     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1168
1169   /* Attempt various optimizations using sqrt and cbrt.  */
1170   type = TREE_TYPE (arg0);
1171   mode = TYPE_MODE (type);
1172   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1173
1174   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1175      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1176      sqrt(-0) = -0.  */
1177   if (sqrtfn
1178       && REAL_VALUES_EQUAL (c, dconsthalf)
1179       && !HONOR_SIGNED_ZEROS (mode))
1180     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1181
1182   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1183      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1184      so do this optimization even if -Os.  Don't do this optimization
1185      if we don't have a hardware sqrt insn.  */
1186   dconst1_4 = dconst1;
1187   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1188   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1189
1190   if (flag_unsafe_math_optimizations
1191       && sqrtfn
1192       && REAL_VALUES_EQUAL (c, dconst1_4)
1193       && hw_sqrt_exists)
1194     {
1195       /* sqrt(x)  */
1196       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1197
1198       /* sqrt(sqrt(x))  */
1199       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1200     }
1201
1202   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1203      optimizing for space.  Don't do this optimization if we don't have
1204      a hardware sqrt insn.  */
1205   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1206   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1207
1208   if (flag_unsafe_math_optimizations
1209       && sqrtfn
1210       && optimize_function_for_speed_p (cfun)
1211       && REAL_VALUES_EQUAL (c, dconst3_4)
1212       && hw_sqrt_exists)
1213     {
1214       /* sqrt(x)  */
1215       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1216
1217       /* sqrt(sqrt(x))  */
1218       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1219
1220       /* sqrt(x) * sqrt(sqrt(x))  */
1221       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1222                                      sqrt_arg0, sqrt_sqrt);
1223     }
1224
1225   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1226      optimizations since 1./3. is not exactly representable.  If x
1227      is negative and finite, the correct value of pow(x,1./3.) is
1228      a NaN with the "invalid" exception raised, because the value
1229      of 1./3. actually has an even denominator.  The correct value
1230      of cbrt(x) is a negative real value.  */
1231   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1232   dconst1_3 = real_value_truncate (mode, dconst_third ());
1233
1234   if (flag_unsafe_math_optimizations
1235       && cbrtfn
1236       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1237       && REAL_VALUES_EQUAL (c, dconst1_3))
1238     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1239
1240   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1241      if we don't have a hardware sqrt insn.  */
1242   dconst1_6 = dconst1_3;
1243   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1244
1245   if (flag_unsafe_math_optimizations
1246       && sqrtfn
1247       && cbrtfn
1248       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1249       && optimize_function_for_speed_p (cfun)
1250       && hw_sqrt_exists
1251       && REAL_VALUES_EQUAL (c, dconst1_6))
1252     {
1253       /* sqrt(x)  */
1254       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1255
1256       /* cbrt(sqrt(x))  */
1257       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1258     }
1259
1260   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1261      and c not an integer, into
1262
1263        sqrt(x) * powi(x, n/2),                n > 0;
1264        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1265
1266      Do not calculate the powi factor when n/2 = 0.  */
1267   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1268   n = real_to_integer (&c2);
1269   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1270   c2_is_int = real_identical (&c2, &cint);
1271
1272   if (flag_unsafe_math_optimizations
1273       && sqrtfn
1274       && c2_is_int
1275       && !c_is_int
1276       && optimize_function_for_speed_p (cfun))
1277     {
1278       tree powi_x_ndiv2 = NULL_TREE;
1279
1280       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1281          possible or profitable, give up.  Skip the degenerate case when
1282          n is 1 or -1, where the result is always 1.  */
1283       if (absu_hwi (n) != 1)
1284         {
1285           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1286                                                      abs_hwi (n / 2));
1287           if (!powi_x_ndiv2)
1288             return NULL_TREE;
1289         }
1290
1291       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1292          result of the optimal multiply sequence just calculated.  */
1293       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1294
1295       if (absu_hwi (n) == 1)
1296         result = sqrt_arg0;
1297       else
1298         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1299                                          sqrt_arg0, powi_x_ndiv2);
1300
1301       /* If n is negative, reciprocate the result.  */
1302       if (n < 0)
1303         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1304                                          build_real (type, dconst1), result);
1305       return result;
1306     }
1307
1308   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1309
1310      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1311      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1312
1313      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1314      different from pow(x, 1./3.) due to rounding and behavior with
1315      negative x, we need to constrain this transformation to unsafe
1316      math and positive x or finite math.  */
1317   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1318   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1319   real_round (&c2, mode, &c2);
1320   n = real_to_integer (&c2);
1321   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1322   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1323   real_convert (&c2, mode, &c2);
1324
1325   if (flag_unsafe_math_optimizations
1326       && cbrtfn
1327       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1328       && real_identical (&c2, &c)
1329       && !c2_is_int
1330       && optimize_function_for_speed_p (cfun)
1331       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1332     {
1333       tree powi_x_ndiv3 = NULL_TREE;
1334
1335       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1336          possible or profitable, give up.  Skip the degenerate case when
1337          abs(n) < 3, where the result is always 1.  */
1338       if (absu_hwi (n) >= 3)
1339         {
1340           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1341                                                      abs_hwi (n / 3));
1342           if (!powi_x_ndiv3)
1343             return NULL_TREE;
1344         }
1345
1346       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1347          as that creates an unnecessary variable.  Instead, just produce
1348          either cbrt(x) or cbrt(x) * cbrt(x).  */
1349       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1350
1351       if (absu_hwi (n) % 3 == 1)
1352         powi_cbrt_x = cbrt_x;
1353       else
1354         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1355                                               cbrt_x, cbrt_x);
1356
1357       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1358       if (absu_hwi (n) < 3)
1359         result = powi_cbrt_x;
1360       else
1361         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1362                                          powi_x_ndiv3, powi_cbrt_x);
1363
1364       /* If n is negative, reciprocate the result.  */
1365       if (n < 0)
1366         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1367                                          build_real (type, dconst1), result);
1368
1369       return result;
1370     }
1371
1372   /* No optimizations succeeded.  */
1373   return NULL_TREE;
1374 }
1375
1376 /* ARG is the argument to a cabs builtin call in GSI with location info
1377    LOC.  Create a sequence of statements prior to GSI that calculates
1378    sqrt(R*R + I*I), where R and I are the real and imaginary components
1379    of ARG, respectively.  Return an expression holding the result.  */
1380
1381 static tree
1382 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1383 {
1384   tree real_part, imag_part, addend1, addend2, sum, result;
1385   tree type = TREE_TYPE (TREE_TYPE (arg));
1386   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1387   enum machine_mode mode = TYPE_MODE (type);
1388
1389   if (!flag_unsafe_math_optimizations
1390       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1391       || !sqrtfn
1392       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1393     return NULL_TREE;
1394
1395   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1396                                     REALPART_EXPR, arg);
1397   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1398                                     real_part, real_part);
1399   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1400                                     IMAGPART_EXPR, arg);
1401   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1402                                     imag_part, imag_part);
1403   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1404   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1405
1406   return result;
1407 }
1408
1409 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1410    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1411    an optimal number of multiplies, when n is a constant.  */
1412
1413 static unsigned int
1414 execute_cse_sincos (void)
1415 {
1416   basic_block bb;
1417   bool cfg_changed = false;
1418
1419   calculate_dominance_info (CDI_DOMINATORS);
1420   memset (&sincos_stats, 0, sizeof (sincos_stats));
1421
1422   FOR_EACH_BB_FN (bb, cfun)
1423     {
1424       gimple_stmt_iterator gsi;
1425       bool cleanup_eh = false;
1426
1427       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1428         {
1429           gimple stmt = gsi_stmt (gsi);
1430           tree fndecl;
1431
1432           /* Only the last stmt in a bb could throw, no need to call
1433              gimple_purge_dead_eh_edges if we change something in the middle
1434              of a basic block.  */
1435           cleanup_eh = false;
1436
1437           if (is_gimple_call (stmt)
1438               && gimple_call_lhs (stmt)
1439               && (fndecl = gimple_call_fndecl (stmt))
1440               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1441             {
1442               tree arg, arg0, arg1, result;
1443               HOST_WIDE_INT n;
1444               location_t loc;
1445
1446               switch (DECL_FUNCTION_CODE (fndecl))
1447                 {
1448                 CASE_FLT_FN (BUILT_IN_COS):
1449                 CASE_FLT_FN (BUILT_IN_SIN):
1450                 CASE_FLT_FN (BUILT_IN_CEXPI):
1451                   /* Make sure we have either sincos or cexp.  */
1452                   if (!targetm.libc_has_function (function_c99_math_complex)
1453                       && !targetm.libc_has_function (function_sincos))
1454                     break;
1455
1456                   arg = gimple_call_arg (stmt, 0);
1457                   if (TREE_CODE (arg) == SSA_NAME)
1458                     cfg_changed |= execute_cse_sincos_1 (arg);
1459                   break;
1460
1461                 CASE_FLT_FN (BUILT_IN_POW):
1462                   arg0 = gimple_call_arg (stmt, 0);
1463                   arg1 = gimple_call_arg (stmt, 1);
1464
1465                   loc = gimple_location (stmt);
1466                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1467
1468                   if (result)
1469                     {
1470                       tree lhs = gimple_get_lhs (stmt);
1471                       gimple new_stmt = gimple_build_assign (lhs, result);
1472                       gimple_set_location (new_stmt, loc);
1473                       unlink_stmt_vdef (stmt);
1474                       gsi_replace (&gsi, new_stmt, true);
1475                       cleanup_eh = true;
1476                       if (gimple_vdef (stmt))
1477                         release_ssa_name (gimple_vdef (stmt));
1478                     }
1479                   break;
1480
1481                 CASE_FLT_FN (BUILT_IN_POWI):
1482                   arg0 = gimple_call_arg (stmt, 0);
1483                   arg1 = gimple_call_arg (stmt, 1);
1484                   loc = gimple_location (stmt);
1485
1486                   if (real_minus_onep (arg0))
1487                     {
1488                       tree t0, t1, cond, one, minus_one;
1489                       gimple stmt;
1490
1491                       t0 = TREE_TYPE (arg0);
1492                       t1 = TREE_TYPE (arg1);
1493                       one = build_real (t0, dconst1);
1494                       minus_one = build_real (t0, dconstm1);
1495
1496                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1497                       stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1498                                                            arg1,
1499                                                            build_int_cst (t1,
1500                                                                           1));
1501                       gimple_set_location (stmt, loc);
1502                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1503
1504                       result = make_temp_ssa_name (t0, NULL, "powi");
1505                       stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1506                                                            cond,
1507                                                            minus_one, one);
1508                       gimple_set_location (stmt, loc);
1509                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1510                     }
1511                   else
1512                     {
1513                       if (!tree_fits_shwi_p (arg1))
1514                         break;
1515
1516                       n = tree_to_shwi (arg1);
1517                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1518                     }
1519
1520                   if (result)
1521                     {
1522                       tree lhs = gimple_get_lhs (stmt);
1523                       gimple new_stmt = gimple_build_assign (lhs, result);
1524                       gimple_set_location (new_stmt, loc);
1525                       unlink_stmt_vdef (stmt);
1526                       gsi_replace (&gsi, new_stmt, true);
1527                       cleanup_eh = true;
1528                       if (gimple_vdef (stmt))
1529                         release_ssa_name (gimple_vdef (stmt));
1530                     }
1531                   break;
1532
1533                 CASE_FLT_FN (BUILT_IN_CABS):
1534                   arg0 = gimple_call_arg (stmt, 0);
1535                   loc = gimple_location (stmt);
1536                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1537
1538                   if (result)
1539                     {
1540                       tree lhs = gimple_get_lhs (stmt);
1541                       gimple new_stmt = gimple_build_assign (lhs, result);
1542                       gimple_set_location (new_stmt, loc);
1543                       unlink_stmt_vdef (stmt);
1544                       gsi_replace (&gsi, new_stmt, true);
1545                       cleanup_eh = true;
1546                       if (gimple_vdef (stmt))
1547                         release_ssa_name (gimple_vdef (stmt));
1548                     }
1549                   break;
1550
1551                 default:;
1552                 }
1553             }
1554         }
1555       if (cleanup_eh)
1556         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1557     }
1558
1559   statistics_counter_event (cfun, "sincos statements inserted",
1560                             sincos_stats.inserted);
1561
1562   free_dominance_info (CDI_DOMINATORS);
1563   return cfg_changed ? TODO_cleanup_cfg : 0;
1564 }
1565
1566 static bool
1567 gate_cse_sincos (void)
1568 {
1569   /* We no longer require either sincos or cexp, since powi expansion
1570      piggybacks on this pass.  */
1571   return optimize;
1572 }
1573
1574 namespace {
1575
1576 const pass_data pass_data_cse_sincos =
1577 {
1578   GIMPLE_PASS, /* type */
1579   "sincos", /* name */
1580   OPTGROUP_NONE, /* optinfo_flags */
1581   true, /* has_gate */
1582   true, /* has_execute */
1583   TV_NONE, /* tv_id */
1584   PROP_ssa, /* properties_required */
1585   0, /* properties_provided */
1586   0, /* properties_destroyed */
1587   0, /* todo_flags_start */
1588   ( TODO_update_ssa | TODO_verify_ssa
1589     | TODO_verify_stmts ), /* todo_flags_finish */
1590 };
1591
1592 class pass_cse_sincos : public gimple_opt_pass
1593 {
1594 public:
1595   pass_cse_sincos (gcc::context *ctxt)
1596     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1597   {}
1598
1599   /* opt_pass methods: */
1600   bool gate () { return gate_cse_sincos (); }
1601   unsigned int execute () { return execute_cse_sincos (); }
1602
1603 }; // class pass_cse_sincos
1604
1605 } // anon namespace
1606
1607 gimple_opt_pass *
1608 make_pass_cse_sincos (gcc::context *ctxt)
1609 {
1610   return new pass_cse_sincos (ctxt);
1611 }
1612
1613 /* A symbolic number is used to detect byte permutation and selection
1614    patterns.  Therefore the field N contains an artificial number
1615    consisting of byte size markers:
1616
1617    0    - byte has the value 0
1618    1..size - byte contains the content of the byte
1619    number indexed with that value minus one  */
1620
1621 struct symbolic_number {
1622   unsigned HOST_WIDEST_INT n;
1623   int size;
1624 };
1625
1626 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1627    number N.  Return false if the requested operation is not permitted
1628    on a symbolic number.  */
1629
1630 static inline bool
1631 do_shift_rotate (enum tree_code code,
1632                  struct symbolic_number *n,
1633                  int count)
1634 {
1635   if (count % 8 != 0)
1636     return false;
1637
1638   /* Zero out the extra bits of N in order to avoid them being shifted
1639      into the significant bits.  */
1640   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1641     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1642
1643   switch (code)
1644     {
1645     case LSHIFT_EXPR:
1646       n->n <<= count;
1647       break;
1648     case RSHIFT_EXPR:
1649       n->n >>= count;
1650       break;
1651     case LROTATE_EXPR:
1652       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1653       break;
1654     case RROTATE_EXPR:
1655       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1656       break;
1657     default:
1658       return false;
1659     }
1660   /* Zero unused bits for size.  */
1661   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1662     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1663   return true;
1664 }
1665
1666 /* Perform sanity checking for the symbolic number N and the gimple
1667    statement STMT.  */
1668
1669 static inline bool
1670 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1671 {
1672   tree lhs_type;
1673
1674   lhs_type = gimple_expr_type (stmt);
1675
1676   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1677     return false;
1678
1679   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1680     return false;
1681
1682   return true;
1683 }
1684
1685 /* find_bswap_1 invokes itself recursively with N and tries to perform
1686    the operation given by the rhs of STMT on the result.  If the
1687    operation could successfully be executed the function returns the
1688    tree expression of the source operand and NULL otherwise.  */
1689
1690 static tree
1691 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1692 {
1693   enum tree_code code;
1694   tree rhs1, rhs2 = NULL;
1695   gimple rhs1_stmt, rhs2_stmt;
1696   tree source_expr1;
1697   enum gimple_rhs_class rhs_class;
1698
1699   if (!limit || !is_gimple_assign (stmt))
1700     return NULL_TREE;
1701
1702   rhs1 = gimple_assign_rhs1 (stmt);
1703
1704   if (TREE_CODE (rhs1) != SSA_NAME)
1705     return NULL_TREE;
1706
1707   code = gimple_assign_rhs_code (stmt);
1708   rhs_class = gimple_assign_rhs_class (stmt);
1709   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1710
1711   if (rhs_class == GIMPLE_BINARY_RHS)
1712     rhs2 = gimple_assign_rhs2 (stmt);
1713
1714   /* Handle unary rhs and binary rhs with integer constants as second
1715      operand.  */
1716
1717   if (rhs_class == GIMPLE_UNARY_RHS
1718       || (rhs_class == GIMPLE_BINARY_RHS
1719           && TREE_CODE (rhs2) == INTEGER_CST))
1720     {
1721       if (code != BIT_AND_EXPR
1722           && code != LSHIFT_EXPR
1723           && code != RSHIFT_EXPR
1724           && code != LROTATE_EXPR
1725           && code != RROTATE_EXPR
1726           && code != NOP_EXPR
1727           && code != CONVERT_EXPR)
1728         return NULL_TREE;
1729
1730       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1731
1732       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1733          to initialize the symbolic number.  */
1734       if (!source_expr1)
1735         {
1736           /* Set up the symbolic number N by setting each byte to a
1737              value between 1 and the byte size of rhs1.  The highest
1738              order byte is set to n->size and the lowest order
1739              byte to 1.  */
1740           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1741           if (n->size % BITS_PER_UNIT != 0)
1742             return NULL_TREE;
1743           n->size /= BITS_PER_UNIT;
1744           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1745                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1746
1747           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1748             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1749                      (n->size * BITS_PER_UNIT)) - 1;
1750
1751           source_expr1 = rhs1;
1752         }
1753
1754       switch (code)
1755         {
1756         case BIT_AND_EXPR:
1757           {
1758             int i;
1759             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1760             unsigned HOST_WIDEST_INT tmp = val;
1761
1762             /* Only constants masking full bytes are allowed.  */
1763             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1764               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1765                 return NULL_TREE;
1766
1767             n->n &= val;
1768           }
1769           break;
1770         case LSHIFT_EXPR:
1771         case RSHIFT_EXPR:
1772         case LROTATE_EXPR:
1773         case RROTATE_EXPR:
1774           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1775             return NULL_TREE;
1776           break;
1777         CASE_CONVERT:
1778           {
1779             int type_size;
1780
1781             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1782             if (type_size % BITS_PER_UNIT != 0)
1783               return NULL_TREE;
1784
1785             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1786               {
1787                 /* If STMT casts to a smaller type mask out the bits not
1788                    belonging to the target type.  */
1789                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1790               }
1791             n->size = type_size / BITS_PER_UNIT;
1792           }
1793           break;
1794         default:
1795           return NULL_TREE;
1796         };
1797       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1798     }
1799
1800   /* Handle binary rhs.  */
1801
1802   if (rhs_class == GIMPLE_BINARY_RHS)
1803     {
1804       int i;
1805       struct symbolic_number n1, n2;
1806       unsigned HOST_WIDEST_INT mask;
1807       tree source_expr2;
1808
1809       if (code != BIT_IOR_EXPR)
1810         return NULL_TREE;
1811
1812       if (TREE_CODE (rhs2) != SSA_NAME)
1813         return NULL_TREE;
1814
1815       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1816
1817       switch (code)
1818         {
1819         case BIT_IOR_EXPR:
1820           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1821
1822           if (!source_expr1)
1823             return NULL_TREE;
1824
1825           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1826
1827           if (source_expr1 != source_expr2
1828               || n1.size != n2.size)
1829             return NULL_TREE;
1830
1831           n->size = n1.size;
1832           for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
1833             {
1834               unsigned HOST_WIDEST_INT masked1, masked2;
1835
1836               masked1 = n1.n & mask;
1837               masked2 = n2.n & mask;
1838               if (masked1 && masked2 && masked1 != masked2)
1839                 return NULL_TREE;
1840             }
1841           n->n = n1.n | n2.n;
1842
1843           if (!verify_symbolic_number_p (n, stmt))
1844             return NULL_TREE;
1845
1846           break;
1847         default:
1848           return NULL_TREE;
1849         }
1850       return source_expr1;
1851     }
1852   return NULL_TREE;
1853 }
1854
1855 /* Check if STMT completes a bswap implementation consisting of ORs,
1856    SHIFTs and ANDs.  Return the source tree expression on which the
1857    byte swap is performed and NULL if no bswap was found.  */
1858
1859 static tree
1860 find_bswap (gimple stmt)
1861 {
1862 /* The number which the find_bswap result should match in order to
1863    have a full byte swap.  The number is shifted to the left according
1864    to the size of the symbolic number before using it.  */
1865   unsigned HOST_WIDEST_INT cmp =
1866     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1867     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1868
1869   struct symbolic_number n;
1870   tree source_expr;
1871   int limit;
1872
1873   /* The last parameter determines the depth search limit.  It usually
1874      correlates directly to the number of bytes to be touched.  We
1875      increase that number by three  here in order to also
1876      cover signed -> unsigned converions of the src operand as can be seen
1877      in libgcc, and for initial shift/and operation of the src operand.  */
1878   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1879   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1880   source_expr =  find_bswap_1 (stmt, &n, limit);
1881
1882   if (!source_expr)
1883     return NULL_TREE;
1884
1885   /* Zero out the extra bits of N and CMP.  */
1886   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1887     {
1888       unsigned HOST_WIDEST_INT mask =
1889         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1890
1891       n.n &= mask;
1892       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1893     }
1894
1895   /* A complete byte swap should make the symbolic number to start
1896      with the largest digit in the highest order byte.  */
1897   if (cmp != n.n)
1898     return NULL_TREE;
1899
1900   return source_expr;
1901 }
1902
1903 /* Find manual byte swap implementations and turn them into a bswap
1904    builtin invokation.  */
1905
1906 static unsigned int
1907 execute_optimize_bswap (void)
1908 {
1909   basic_block bb;
1910   bool bswap16_p, bswap32_p, bswap64_p;
1911   bool changed = false;
1912   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1913
1914   if (BITS_PER_UNIT != 8)
1915     return 0;
1916
1917   if (sizeof (HOST_WIDEST_INT) < 8)
1918     return 0;
1919
1920   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1921                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1922   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1923                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1924   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1925                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1926                    || (bswap32_p && word_mode == SImode)));
1927
1928   if (!bswap16_p && !bswap32_p && !bswap64_p)
1929     return 0;
1930
1931   /* Determine the argument type of the builtins.  The code later on
1932      assumes that the return and argument type are the same.  */
1933   if (bswap16_p)
1934     {
1935       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1936       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1937     }
1938
1939   if (bswap32_p)
1940     {
1941       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1942       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1943     }
1944
1945   if (bswap64_p)
1946     {
1947       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1948       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1949     }
1950
1951   memset (&bswap_stats, 0, sizeof (bswap_stats));
1952
1953   FOR_EACH_BB_FN (bb, cfun)
1954     {
1955       gimple_stmt_iterator gsi;
1956
1957       /* We do a reverse scan for bswap patterns to make sure we get the
1958          widest match. As bswap pattern matching doesn't handle
1959          previously inserted smaller bswap replacements as sub-
1960          patterns, the wider variant wouldn't be detected.  */
1961       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1962         {
1963           gimple stmt = gsi_stmt (gsi);
1964           tree bswap_src, bswap_type;
1965           tree bswap_tmp;
1966           tree fndecl = NULL_TREE;
1967           int type_size;
1968           gimple call;
1969
1970           if (!is_gimple_assign (stmt)
1971               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1972             continue;
1973
1974           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1975
1976           switch (type_size)
1977             {
1978             case 16:
1979               if (bswap16_p)
1980                 {
1981                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1982                   bswap_type = bswap16_type;
1983                 }
1984               break;
1985             case 32:
1986               if (bswap32_p)
1987                 {
1988                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1989                   bswap_type = bswap32_type;
1990                 }
1991               break;
1992             case 64:
1993               if (bswap64_p)
1994                 {
1995                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1996                   bswap_type = bswap64_type;
1997                 }
1998               break;
1999             default:
2000               continue;
2001             }
2002
2003           if (!fndecl)
2004             continue;
2005
2006           bswap_src = find_bswap (stmt);
2007
2008           if (!bswap_src)
2009             continue;
2010
2011           changed = true;
2012           if (type_size == 16)
2013             bswap_stats.found_16bit++;
2014           else if (type_size == 32)
2015             bswap_stats.found_32bit++;
2016           else
2017             bswap_stats.found_64bit++;
2018
2019           bswap_tmp = bswap_src;
2020
2021           /* Convert the src expression if necessary.  */
2022           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2023             {
2024               gimple convert_stmt;
2025               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2026               convert_stmt = gimple_build_assign_with_ops
2027                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2028               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2029             }
2030
2031           call = gimple_build_call (fndecl, 1, bswap_tmp);
2032
2033           bswap_tmp = gimple_assign_lhs (stmt);
2034
2035           /* Convert the result if necessary.  */
2036           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2037             {
2038               gimple convert_stmt;
2039               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2040               convert_stmt = gimple_build_assign_with_ops
2041                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2042               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2043             }
2044
2045           gimple_call_set_lhs (call, bswap_tmp);
2046
2047           if (dump_file)
2048             {
2049               fprintf (dump_file, "%d bit bswap implementation found at: ",
2050                        (int)type_size);
2051               print_gimple_stmt (dump_file, stmt, 0, 0);
2052             }
2053
2054           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2055           gsi_remove (&gsi, true);
2056         }
2057     }
2058
2059   statistics_counter_event (cfun, "16-bit bswap implementations found",
2060                             bswap_stats.found_16bit);
2061   statistics_counter_event (cfun, "32-bit bswap implementations found",
2062                             bswap_stats.found_32bit);
2063   statistics_counter_event (cfun, "64-bit bswap implementations found",
2064                             bswap_stats.found_64bit);
2065
2066   return (changed ? TODO_update_ssa | TODO_verify_ssa
2067           | TODO_verify_stmts : 0);
2068 }
2069
2070 static bool
2071 gate_optimize_bswap (void)
2072 {
2073   return flag_expensive_optimizations && optimize;
2074 }
2075
2076 namespace {
2077
2078 const pass_data pass_data_optimize_bswap =
2079 {
2080   GIMPLE_PASS, /* type */
2081   "bswap", /* name */
2082   OPTGROUP_NONE, /* optinfo_flags */
2083   true, /* has_gate */
2084   true, /* has_execute */
2085   TV_NONE, /* tv_id */
2086   PROP_ssa, /* properties_required */
2087   0, /* properties_provided */
2088   0, /* properties_destroyed */
2089   0, /* todo_flags_start */
2090   0, /* todo_flags_finish */
2091 };
2092
2093 class pass_optimize_bswap : public gimple_opt_pass
2094 {
2095 public:
2096   pass_optimize_bswap (gcc::context *ctxt)
2097     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2098   {}
2099
2100   /* opt_pass methods: */
2101   bool gate () { return gate_optimize_bswap (); }
2102   unsigned int execute () { return execute_optimize_bswap (); }
2103
2104 }; // class pass_optimize_bswap
2105
2106 } // anon namespace
2107
2108 gimple_opt_pass *
2109 make_pass_optimize_bswap (gcc::context *ctxt)
2110 {
2111   return new pass_optimize_bswap (ctxt);
2112 }
2113
2114 /* Return true if stmt is a type conversion operation that can be stripped
2115    when used in a widening multiply operation.  */
2116 static bool
2117 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2118 {
2119   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2120
2121   if (TREE_CODE (result_type) == INTEGER_TYPE)
2122     {
2123       tree op_type;
2124       tree inner_op_type;
2125
2126       if (!CONVERT_EXPR_CODE_P (rhs_code))
2127         return false;
2128
2129       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2130
2131       /* If the type of OP has the same precision as the result, then
2132          we can strip this conversion.  The multiply operation will be
2133          selected to create the correct extension as a by-product.  */
2134       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2135         return true;
2136
2137       /* We can also strip a conversion if it preserves the signed-ness of
2138          the operation and doesn't narrow the range.  */
2139       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2140
2141       /* If the inner-most type is unsigned, then we can strip any
2142          intermediate widening operation.  If it's signed, then the
2143          intermediate widening operation must also be signed.  */
2144       if ((TYPE_UNSIGNED (inner_op_type)
2145            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2146           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2147         return true;
2148
2149       return false;
2150     }
2151
2152   return rhs_code == FIXED_CONVERT_EXPR;
2153 }
2154
2155 /* Return true if RHS is a suitable operand for a widening multiplication,
2156    assuming a target type of TYPE.
2157    There are two cases:
2158
2159      - RHS makes some value at least twice as wide.  Store that value
2160        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2161
2162      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2163        but leave *TYPE_OUT untouched.  */
2164
2165 static bool
2166 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2167                         tree *new_rhs_out)
2168 {
2169   gimple stmt;
2170   tree type1, rhs1;
2171
2172   if (TREE_CODE (rhs) == SSA_NAME)
2173     {
2174       stmt = SSA_NAME_DEF_STMT (rhs);
2175       if (is_gimple_assign (stmt))
2176         {
2177           if (! widening_mult_conversion_strippable_p (type, stmt))
2178             rhs1 = rhs;
2179           else
2180             {
2181               rhs1 = gimple_assign_rhs1 (stmt);
2182
2183               if (TREE_CODE (rhs1) == INTEGER_CST)
2184                 {
2185                   *new_rhs_out = rhs1;
2186                   *type_out = NULL;
2187                   return true;
2188                 }
2189             }
2190         }
2191       else
2192         rhs1 = rhs;
2193
2194       type1 = TREE_TYPE (rhs1);
2195
2196       if (TREE_CODE (type1) != TREE_CODE (type)
2197           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2198         return false;
2199
2200       *new_rhs_out = rhs1;
2201       *type_out = type1;
2202       return true;
2203     }
2204
2205   if (TREE_CODE (rhs) == INTEGER_CST)
2206     {
2207       *new_rhs_out = rhs;
2208       *type_out = NULL;
2209       return true;
2210     }
2211
2212   return false;
2213 }
2214
2215 /* Return true if STMT performs a widening multiplication, assuming the
2216    output type is TYPE.  If so, store the unwidened types of the operands
2217    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2218    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2219    and *TYPE2_OUT would give the operands of the multiplication.  */
2220
2221 static bool
2222 is_widening_mult_p (gimple stmt,
2223                     tree *type1_out, tree *rhs1_out,
2224                     tree *type2_out, tree *rhs2_out)
2225 {
2226   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2227
2228   if (TREE_CODE (type) != INTEGER_TYPE
2229       && TREE_CODE (type) != FIXED_POINT_TYPE)
2230     return false;
2231
2232   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2233                                rhs1_out))
2234     return false;
2235
2236   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2237                                rhs2_out))
2238     return false;
2239
2240   if (*type1_out == NULL)
2241     {
2242       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2243         return false;
2244       *type1_out = *type2_out;
2245     }
2246
2247   if (*type2_out == NULL)
2248     {
2249       if (!int_fits_type_p (*rhs2_out, *type1_out))
2250         return false;
2251       *type2_out = *type1_out;
2252     }
2253
2254   /* Ensure that the larger of the two operands comes first. */
2255   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2256     {
2257       tree tmp;
2258       tmp = *type1_out;
2259       *type1_out = *type2_out;
2260       *type2_out = tmp;
2261       tmp = *rhs1_out;
2262       *rhs1_out = *rhs2_out;
2263       *rhs2_out = tmp;
2264     }
2265
2266   return true;
2267 }
2268
2269 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2270    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2271    value is true iff we converted the statement.  */
2272
2273 static bool
2274 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2275 {
2276   tree lhs, rhs1, rhs2, type, type1, type2;
2277   enum insn_code handler;
2278   enum machine_mode to_mode, from_mode, actual_mode;
2279   optab op;
2280   int actual_precision;
2281   location_t loc = gimple_location (stmt);
2282   bool from_unsigned1, from_unsigned2;
2283
2284   lhs = gimple_assign_lhs (stmt);
2285   type = TREE_TYPE (lhs);
2286   if (TREE_CODE (type) != INTEGER_TYPE)
2287     return false;
2288
2289   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2290     return false;
2291
2292   to_mode = TYPE_MODE (type);
2293   from_mode = TYPE_MODE (type1);
2294   from_unsigned1 = TYPE_UNSIGNED (type1);
2295   from_unsigned2 = TYPE_UNSIGNED (type2);
2296
2297   if (from_unsigned1 && from_unsigned2)
2298     op = umul_widen_optab;
2299   else if (!from_unsigned1 && !from_unsigned2)
2300     op = smul_widen_optab;
2301   else
2302     op = usmul_widen_optab;
2303
2304   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2305                                                   0, &actual_mode);
2306
2307   if (handler == CODE_FOR_nothing)
2308     {
2309       if (op != smul_widen_optab)
2310         {
2311           /* We can use a signed multiply with unsigned types as long as
2312              there is a wider mode to use, or it is the smaller of the two
2313              types that is unsigned.  Note that type1 >= type2, always.  */
2314           if ((TYPE_UNSIGNED (type1)
2315                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2316               || (TYPE_UNSIGNED (type2)
2317                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2318             {
2319               from_mode = GET_MODE_WIDER_MODE (from_mode);
2320               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2321                 return false;
2322             }
2323
2324           op = smul_widen_optab;
2325           handler = find_widening_optab_handler_and_mode (op, to_mode,
2326                                                           from_mode, 0,
2327                                                           &actual_mode);
2328
2329           if (handler == CODE_FOR_nothing)
2330             return false;
2331
2332           from_unsigned1 = from_unsigned2 = false;
2333         }
2334       else
2335         return false;
2336     }
2337
2338   /* Ensure that the inputs to the handler are in the correct precison
2339      for the opcode.  This will be the full mode size.  */
2340   actual_precision = GET_MODE_PRECISION (actual_mode);
2341   if (2 * actual_precision > TYPE_PRECISION (type))
2342     return false;
2343   if (actual_precision != TYPE_PRECISION (type1)
2344       || from_unsigned1 != TYPE_UNSIGNED (type1))
2345     rhs1 = build_and_insert_cast (gsi, loc,
2346                                   build_nonstandard_integer_type
2347                                     (actual_precision, from_unsigned1), rhs1);
2348   if (actual_precision != TYPE_PRECISION (type2)
2349       || from_unsigned2 != TYPE_UNSIGNED (type2))
2350     rhs2 = build_and_insert_cast (gsi, loc,
2351                                   build_nonstandard_integer_type
2352                                     (actual_precision, from_unsigned2), rhs2);
2353
2354   /* Handle constants.  */
2355   if (TREE_CODE (rhs1) == INTEGER_CST)
2356     rhs1 = fold_convert (type1, rhs1);
2357   if (TREE_CODE (rhs2) == INTEGER_CST)
2358     rhs2 = fold_convert (type2, rhs2);
2359
2360   gimple_assign_set_rhs1 (stmt, rhs1);
2361   gimple_assign_set_rhs2 (stmt, rhs2);
2362   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2363   update_stmt (stmt);
2364   widen_mul_stats.widen_mults_inserted++;
2365   return true;
2366 }
2367
2368 /* Process a single gimple statement STMT, which is found at the
2369    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2370    rhs (given by CODE), and try to convert it into a
2371    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2372    is true iff we converted the statement.  */
2373
2374 static bool
2375 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2376                             enum tree_code code)
2377 {
2378   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2379   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2380   tree type, type1, type2, optype;
2381   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2382   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2383   optab this_optab;
2384   enum tree_code wmult_code;
2385   enum insn_code handler;
2386   enum machine_mode to_mode, from_mode, actual_mode;
2387   location_t loc = gimple_location (stmt);
2388   int actual_precision;
2389   bool from_unsigned1, from_unsigned2;
2390
2391   lhs = gimple_assign_lhs (stmt);
2392   type = TREE_TYPE (lhs);
2393   if (TREE_CODE (type) != INTEGER_TYPE
2394       && TREE_CODE (type) != FIXED_POINT_TYPE)
2395     return false;
2396
2397   if (code == MINUS_EXPR)
2398     wmult_code = WIDEN_MULT_MINUS_EXPR;
2399   else
2400     wmult_code = WIDEN_MULT_PLUS_EXPR;
2401
2402   rhs1 = gimple_assign_rhs1 (stmt);
2403   rhs2 = gimple_assign_rhs2 (stmt);
2404
2405   if (TREE_CODE (rhs1) == SSA_NAME)
2406     {
2407       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2408       if (is_gimple_assign (rhs1_stmt))
2409         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2410     }
2411
2412   if (TREE_CODE (rhs2) == SSA_NAME)
2413     {
2414       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2415       if (is_gimple_assign (rhs2_stmt))
2416         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2417     }
2418
2419   /* Allow for one conversion statement between the multiply
2420      and addition/subtraction statement.  If there are more than
2421      one conversions then we assume they would invalidate this
2422      transformation.  If that's not the case then they should have
2423      been folded before now.  */
2424   if (CONVERT_EXPR_CODE_P (rhs1_code))
2425     {
2426       conv1_stmt = rhs1_stmt;
2427       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2428       if (TREE_CODE (rhs1) == SSA_NAME)
2429         {
2430           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2431           if (is_gimple_assign (rhs1_stmt))
2432             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2433         }
2434       else
2435         return false;
2436     }
2437   if (CONVERT_EXPR_CODE_P (rhs2_code))
2438     {
2439       conv2_stmt = rhs2_stmt;
2440       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2441       if (TREE_CODE (rhs2) == SSA_NAME)
2442         {
2443           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2444           if (is_gimple_assign (rhs2_stmt))
2445             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2446         }
2447       else
2448         return false;
2449     }
2450
2451   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2452      is_widening_mult_p, but we still need the rhs returns.
2453
2454      It might also appear that it would be sufficient to use the existing
2455      operands of the widening multiply, but that would limit the choice of
2456      multiply-and-accumulate instructions.
2457
2458      If the widened-multiplication result has more than one uses, it is
2459      probably wiser not to do the conversion.  */
2460   if (code == PLUS_EXPR
2461       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2462     {
2463       if (!has_single_use (rhs1)
2464           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2465                                   &type2, &mult_rhs2))
2466         return false;
2467       add_rhs = rhs2;
2468       conv_stmt = conv1_stmt;
2469     }
2470   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2471     {
2472       if (!has_single_use (rhs2)
2473           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2474                                   &type2, &mult_rhs2))
2475         return false;
2476       add_rhs = rhs1;
2477       conv_stmt = conv2_stmt;
2478     }
2479   else
2480     return false;
2481
2482   to_mode = TYPE_MODE (type);
2483   from_mode = TYPE_MODE (type1);
2484   from_unsigned1 = TYPE_UNSIGNED (type1);
2485   from_unsigned2 = TYPE_UNSIGNED (type2);
2486   optype = type1;
2487
2488   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2489   if (from_unsigned1 != from_unsigned2)
2490     {
2491       if (!INTEGRAL_TYPE_P (type))
2492         return false;
2493       /* We can use a signed multiply with unsigned types as long as
2494          there is a wider mode to use, or it is the smaller of the two
2495          types that is unsigned.  Note that type1 >= type2, always.  */
2496       if ((from_unsigned1
2497            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2498           || (from_unsigned2
2499               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2500         {
2501           from_mode = GET_MODE_WIDER_MODE (from_mode);
2502           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2503             return false;
2504         }
2505
2506       from_unsigned1 = from_unsigned2 = false;
2507       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2508                                                false);
2509     }
2510
2511   /* If there was a conversion between the multiply and addition
2512      then we need to make sure it fits a multiply-and-accumulate.
2513      The should be a single mode change which does not change the
2514      value.  */
2515   if (conv_stmt)
2516     {
2517       /* We use the original, unmodified data types for this.  */
2518       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2519       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2520       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2521       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2522
2523       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2524         {
2525           /* Conversion is a truncate.  */
2526           if (TYPE_PRECISION (to_type) < data_size)
2527             return false;
2528         }
2529       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2530         {
2531           /* Conversion is an extend.  Check it's the right sort.  */
2532           if (TYPE_UNSIGNED (from_type) != is_unsigned
2533               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2534             return false;
2535         }
2536       /* else convert is a no-op for our purposes.  */
2537     }
2538
2539   /* Verify that the machine can perform a widening multiply
2540      accumulate in this mode/signedness combination, otherwise
2541      this transformation is likely to pessimize code.  */
2542   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2543   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2544                                                   from_mode, 0, &actual_mode);
2545
2546   if (handler == CODE_FOR_nothing)
2547     return false;
2548
2549   /* Ensure that the inputs to the handler are in the correct precision
2550      for the opcode.  This will be the full mode size.  */
2551   actual_precision = GET_MODE_PRECISION (actual_mode);
2552   if (actual_precision != TYPE_PRECISION (type1)
2553       || from_unsigned1 != TYPE_UNSIGNED (type1))
2554     mult_rhs1 = build_and_insert_cast (gsi, loc,
2555                                        build_nonstandard_integer_type
2556                                          (actual_precision, from_unsigned1),
2557                                        mult_rhs1);
2558   if (actual_precision != TYPE_PRECISION (type2)
2559       || from_unsigned2 != TYPE_UNSIGNED (type2))
2560     mult_rhs2 = build_and_insert_cast (gsi, loc,
2561                                        build_nonstandard_integer_type
2562                                          (actual_precision, from_unsigned2),
2563                                        mult_rhs2);
2564
2565   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2566     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2567
2568   /* Handle constants.  */
2569   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2570     mult_rhs1 = fold_convert (type1, mult_rhs1);
2571   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2572     mult_rhs2 = fold_convert (type2, mult_rhs2);
2573
2574   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2575                                     add_rhs);
2576   update_stmt (gsi_stmt (*gsi));
2577   widen_mul_stats.maccs_inserted++;
2578   return true;
2579 }
2580
2581 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2582    with uses in additions and subtractions to form fused multiply-add
2583    operations.  Returns true if successful and MUL_STMT should be removed.  */
2584
2585 static bool
2586 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2587 {
2588   tree mul_result = gimple_get_lhs (mul_stmt);
2589   tree type = TREE_TYPE (mul_result);
2590   gimple use_stmt, neguse_stmt, fma_stmt;
2591   use_operand_p use_p;
2592   imm_use_iterator imm_iter;
2593
2594   if (FLOAT_TYPE_P (type)
2595       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2596     return false;
2597
2598   /* We don't want to do bitfield reduction ops.  */
2599   if (INTEGRAL_TYPE_P (type)
2600       && (TYPE_PRECISION (type)
2601           != GET_MODE_PRECISION (TYPE_MODE (type))))
2602     return false;
2603
2604   /* If the target doesn't support it, don't generate it.  We assume that
2605      if fma isn't available then fms, fnma or fnms are not either.  */
2606   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2607     return false;
2608
2609   /* If the multiplication has zero uses, it is kept around probably because
2610      of -fnon-call-exceptions.  Don't optimize it away in that case,
2611      it is DCE job.  */
2612   if (has_zero_uses (mul_result))
2613     return false;
2614
2615   /* Make sure that the multiplication statement becomes dead after
2616      the transformation, thus that all uses are transformed to FMAs.
2617      This means we assume that an FMA operation has the same cost
2618      as an addition.  */
2619   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2620     {
2621       enum tree_code use_code;
2622       tree result = mul_result;
2623       bool negate_p = false;
2624
2625       use_stmt = USE_STMT (use_p);
2626
2627       if (is_gimple_debug (use_stmt))
2628         continue;
2629
2630       /* For now restrict this operations to single basic blocks.  In theory
2631          we would want to support sinking the multiplication in
2632          m = a*b;
2633          if ()
2634            ma = m + c;
2635          else
2636            d = m;
2637          to form a fma in the then block and sink the multiplication to the
2638          else block.  */
2639       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2640         return false;
2641
2642       if (!is_gimple_assign (use_stmt))
2643         return false;
2644
2645       use_code = gimple_assign_rhs_code (use_stmt);
2646
2647       /* A negate on the multiplication leads to FNMA.  */
2648       if (use_code == NEGATE_EXPR)
2649         {
2650           ssa_op_iter iter;
2651           use_operand_p usep;
2652
2653           result = gimple_assign_lhs (use_stmt);
2654
2655           /* Make sure the negate statement becomes dead with this
2656              single transformation.  */
2657           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2658                                &use_p, &neguse_stmt))
2659             return false;
2660
2661           /* Make sure the multiplication isn't also used on that stmt.  */
2662           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2663             if (USE_FROM_PTR (usep) == mul_result)
2664               return false;
2665
2666           /* Re-validate.  */
2667           use_stmt = neguse_stmt;
2668           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2669             return false;
2670           if (!is_gimple_assign (use_stmt))
2671             return false;
2672
2673           use_code = gimple_assign_rhs_code (use_stmt);
2674           negate_p = true;
2675         }
2676
2677       switch (use_code)
2678         {
2679         case MINUS_EXPR:
2680           if (gimple_assign_rhs2 (use_stmt) == result)
2681             negate_p = !negate_p;
2682           break;
2683         case PLUS_EXPR:
2684           break;
2685         default:
2686           /* FMA can only be formed from PLUS and MINUS.  */
2687           return false;
2688         }
2689
2690       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2691          by a MULT_EXPR that we'll visit later, we might be able to
2692          get a more profitable match with fnma.
2693          OTOH, if we don't, a negate / fma pair has likely lower latency
2694          that a mult / subtract pair.  */
2695       if (use_code == MINUS_EXPR && !negate_p
2696           && gimple_assign_rhs1 (use_stmt) == result
2697           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2698           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2699         {
2700           tree rhs2 = gimple_assign_rhs2 (use_stmt);
2701
2702           if (TREE_CODE (rhs2) == SSA_NAME)
2703             {
2704               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2705               if (has_single_use (rhs2)
2706                   && is_gimple_assign (stmt2)
2707                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2708               return false;
2709             }
2710         }
2711
2712       /* We can't handle a * b + a * b.  */
2713       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2714         return false;
2715
2716       /* While it is possible to validate whether or not the exact form
2717          that we've recognized is available in the backend, the assumption
2718          is that the transformation is never a loss.  For instance, suppose
2719          the target only has the plain FMA pattern available.  Consider
2720          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2721          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2722          still have 3 operations, but in the FMA form the two NEGs are
2723          independent and could be run in parallel.  */
2724     }
2725
2726   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2727     {
2728       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2729       enum tree_code use_code;
2730       tree addop, mulop1 = op1, result = mul_result;
2731       bool negate_p = false;
2732
2733       if (is_gimple_debug (use_stmt))
2734         continue;
2735
2736       use_code = gimple_assign_rhs_code (use_stmt);
2737       if (use_code == NEGATE_EXPR)
2738         {
2739           result = gimple_assign_lhs (use_stmt);
2740           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2741           gsi_remove (&gsi, true);
2742           release_defs (use_stmt);
2743
2744           use_stmt = neguse_stmt;
2745           gsi = gsi_for_stmt (use_stmt);
2746           use_code = gimple_assign_rhs_code (use_stmt);
2747           negate_p = true;
2748         }
2749
2750       if (gimple_assign_rhs1 (use_stmt) == result)
2751         {
2752           addop = gimple_assign_rhs2 (use_stmt);
2753           /* a * b - c -> a * b + (-c)  */
2754           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2755             addop = force_gimple_operand_gsi (&gsi,
2756                                               build1 (NEGATE_EXPR,
2757                                                       type, addop),
2758                                               true, NULL_TREE, true,
2759                                               GSI_SAME_STMT);
2760         }
2761       else
2762         {
2763           addop = gimple_assign_rhs1 (use_stmt);
2764           /* a - b * c -> (-b) * c + a */
2765           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2766             negate_p = !negate_p;
2767         }
2768
2769       if (negate_p)
2770         mulop1 = force_gimple_operand_gsi (&gsi,
2771                                            build1 (NEGATE_EXPR,
2772                                                    type, mulop1),
2773                                            true, NULL_TREE, true,
2774                                            GSI_SAME_STMT);
2775
2776       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2777                                                gimple_assign_lhs (use_stmt),
2778                                                mulop1, op2,
2779                                                addop);
2780       gsi_replace (&gsi, fma_stmt, true);
2781       widen_mul_stats.fmas_inserted++;
2782     }
2783
2784   return true;
2785 }
2786
2787 /* Find integer multiplications where the operands are extended from
2788    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2789    where appropriate.  */
2790
2791 static unsigned int
2792 execute_optimize_widening_mul (void)
2793 {
2794   basic_block bb;
2795   bool cfg_changed = false;
2796
2797   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2798
2799   FOR_EACH_BB_FN (bb, cfun)
2800     {
2801       gimple_stmt_iterator gsi;
2802
2803       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2804         {
2805           gimple stmt = gsi_stmt (gsi);
2806           enum tree_code code;
2807
2808           if (is_gimple_assign (stmt))
2809             {
2810               code = gimple_assign_rhs_code (stmt);
2811               switch (code)
2812                 {
2813                 case MULT_EXPR:
2814                   if (!convert_mult_to_widen (stmt, &gsi)
2815                       && convert_mult_to_fma (stmt,
2816                                               gimple_assign_rhs1 (stmt),
2817                                               gimple_assign_rhs2 (stmt)))
2818                     {
2819                       gsi_remove (&gsi, true);
2820                       release_defs (stmt);
2821                       continue;
2822                     }
2823                   break;
2824
2825                 case PLUS_EXPR:
2826                 case MINUS_EXPR:
2827                   convert_plusminus_to_widen (&gsi, stmt, code);
2828                   break;
2829
2830                 default:;
2831                 }
2832             }
2833           else if (is_gimple_call (stmt)
2834                    && gimple_call_lhs (stmt))
2835             {
2836               tree fndecl = gimple_call_fndecl (stmt);
2837               if (fndecl
2838                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2839                 {
2840                   switch (DECL_FUNCTION_CODE (fndecl))
2841                     {
2842                       case BUILT_IN_POWF:
2843                       case BUILT_IN_POW:
2844                       case BUILT_IN_POWL:
2845                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2846                             && REAL_VALUES_EQUAL
2847                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2848                                   dconst2)
2849                             && convert_mult_to_fma (stmt,
2850                                                     gimple_call_arg (stmt, 0),
2851                                                     gimple_call_arg (stmt, 0)))
2852                           {
2853                             unlink_stmt_vdef (stmt);
2854                             if (gsi_remove (&gsi, true)
2855                                 && gimple_purge_dead_eh_edges (bb))
2856                               cfg_changed = true;
2857                             release_defs (stmt);
2858                             continue;
2859                           }
2860                           break;
2861
2862                       default:;
2863                     }
2864                 }
2865             }
2866           gsi_next (&gsi);
2867         }
2868     }
2869
2870   statistics_counter_event (cfun, "widening multiplications inserted",
2871                             widen_mul_stats.widen_mults_inserted);
2872   statistics_counter_event (cfun, "widening maccs inserted",
2873                             widen_mul_stats.maccs_inserted);
2874   statistics_counter_event (cfun, "fused multiply-adds inserted",
2875                             widen_mul_stats.fmas_inserted);
2876
2877   return cfg_changed ? TODO_cleanup_cfg : 0;
2878 }
2879
2880 static bool
2881 gate_optimize_widening_mul (void)
2882 {
2883   return flag_expensive_optimizations && optimize;
2884 }
2885
2886 namespace {
2887
2888 const pass_data pass_data_optimize_widening_mul =
2889 {
2890   GIMPLE_PASS, /* type */
2891   "widening_mul", /* name */
2892   OPTGROUP_NONE, /* optinfo_flags */
2893   true, /* has_gate */
2894   true, /* has_execute */
2895   TV_NONE, /* tv_id */
2896   PROP_ssa, /* properties_required */
2897   0, /* properties_provided */
2898   0, /* properties_destroyed */
2899   0, /* todo_flags_start */
2900   ( TODO_verify_ssa | TODO_verify_stmts
2901     | TODO_update_ssa ), /* todo_flags_finish */
2902 };
2903
2904 class pass_optimize_widening_mul : public gimple_opt_pass
2905 {
2906 public:
2907   pass_optimize_widening_mul (gcc::context *ctxt)
2908     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2909   {}
2910
2911   /* opt_pass methods: */
2912   bool gate () { return gate_optimize_widening_mul (); }
2913   unsigned int execute () { return execute_optimize_widening_mul (); }
2914
2915 }; // class pass_optimize_widening_mul
2916
2917 } // anon namespace
2918
2919 gimple_opt_pass *
2920 make_pass_optimize_widening_mul (gcc::context *ctxt)
2921 {
2922   return new pass_optimize_widening_mul (ctxt);
2923 }