gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
   45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
   59    that has fewer than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
   63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "predict.h"
  94 #include "vec.h"
  95 #include "hashtab.h"
  96 #include "hash-set.h"
  97 #include "machmode.h"
  98 #include "hard-reg-set.h"
  99 #include "input.h"
 100 #include "function.h"
 101 #include "dominance.h"
 102 #include "cfg.h"
 103 #include "basic-block.h"
 104 #include "tree-ssa-alias.h"
 105 #include "internal-fn.h"
 106 #include "gimple-fold.h"
 107 #include "gimple-expr.h"
 108 #include "is-a.h"
 109 #include "gimple.h"
 110 #include "gimple-iterator.h"
 111 #include "gimplify.h"
 112 #include "gimplify-me.h"
 113 #include "stor-layout.h"
 114 #include "gimple-ssa.h"
 115 #include "tree-cfg.h"
 116 #include "tree-phinodes.h"
 117 #include "ssa-iterators.h"
 118 #include "stringpool.h"
 119 #include "tree-ssanames.h"
 120 #include "expr.h"
 121 #include "tree-dfa.h"
 122 #include "tree-ssa.h"
 123 #include "tree-pass.h"
 124 #include "alloc-pool.h"
 125 #include "target.h"
 126 #include "gimple-pretty-print.h"
 127 #include "builtins.h"
 128
 129 /* FIXME: RTL headers have to be included here for optabs.  */
 130 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 131 #include "expr.h"               /* Because optabs.h wants sepops.  */
 132 #include "insn-codes.h"
 133 #include "optabs.h"
 134
 135 /* This structure represents one basic block that either computes a
 136    division, or is a common dominator for basic block that compute a
 137    division.  */
 138 struct occurrence {
 139   /* The basic block represented by this structure.  */
 140   basic_block bb;
 141
 142   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 143      inserted in BB.  */
 144   tree recip_def;
 145
 146   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 147      was inserted in BB.  */
 148   gimple recip_def_stmt;
 149
 150   /* Pointer to a list of "struct occurrence"s for blocks dominated
 151      by BB.  */
 152   struct occurrence *children;
 153
 154   /* Pointer to the next "struct occurrence"s in the list of blocks
 155      sharing a common dominator.  */
 156   struct occurrence *next;
 157
 158   /* The number of divisions that are in BB before compute_merit.  The
 159      number of divisions that are in BB or post-dominate it after
 160      compute_merit.  */
 161   int num_divisions;
 162
 163   /* True if the basic block has a division, false if it is a common
 164      dominator for basic blocks that do.  If it is false and trapping
 165      math is active, BB is not a candidate for inserting a reciprocal.  */
 166   bool bb_has_division;
 167 };
 168
 169 static struct
 170 {
 171   /* Number of 1.0/X ops inserted.  */
 172   int rdivs_inserted;
 173
 174   /* Number of 1.0/FUNC ops inserted.  */
 175   int rfuncs_inserted;
 176 } reciprocal_stats;
 177
 178 static struct
 179 {
 180   /* Number of cexpi calls inserted.  */
 181   int inserted;
 182 } sincos_stats;
 183
 184 static struct
 185 {
 186   /* Number of hand-written 16-bit nop / bswaps found.  */
 187   int found_16bit;
 188
 189   /* Number of hand-written 32-bit nop / bswaps found.  */
 190   int found_32bit;
 191
 192   /* Number of hand-written 64-bit nop / bswaps found.  */
 193   int found_64bit;
 194 } nop_stats, bswap_stats;
 195
 196 static struct
 197 {
 198   /* Number of widening multiplication ops inserted.  */
 199   int widen_mults_inserted;
 200
 201   /* Number of integer multiply-and-accumulate ops inserted.  */
 202   int maccs_inserted;
 203
 204   /* Number of fp fused multiply-add ops inserted.  */
 205   int fmas_inserted;
 206 } widen_mul_stats;
 207
 208 /* The instance of "struct occurrence" representing the highest
 209    interesting block in the dominator tree.  */
 210 static struct occurrence *occ_head;
 211
 212 /* Allocation pool for getting instances of "struct occurrence".  */
 213 static alloc_pool occ_pool;
 214
 215
 216
 217 /* Allocate and return a new struct occurrence for basic block BB, and
 218    whose children list is headed by CHILDREN.  */
 219 static struct occurrence *
 220 occ_new (basic_block bb, struct occurrence *children)
 221 {
 222   struct occurrence *occ;
 223
 224   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 225   memset (occ, 0, sizeof (struct occurrence));
 226
 227   occ->bb = bb;
 228   occ->children = children;
 229   return occ;
 230 }
 231
 232
 233 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 234    list of "struct occurrence"s, one per basic block, having IDOM as
 235    their common dominator.
 236
 237    We try to insert NEW_OCC as deep as possible in the tree, and we also
 238    insert any other block that is a common dominator for BB and one
 239    block already in the tree.  */
 240
 241 static void
 242 insert_bb (struct occurrence *new_occ, basic_block idom,
 243            struct occurrence **p_head)
 244 {
 245   struct occurrence *occ, **p_occ;
 246
 247   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 248     {
 249       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 250       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 251       if (dom == bb)
 252         {
 253           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 254              from its list.  */
 255           *p_occ = occ->next;
 256           occ->next = new_occ->children;
 257           new_occ->children = occ;
 258
 259           /* Try the next block (it may as well be dominated by BB).  */
 260         }
 261
 262       else if (dom == occ_bb)
 263         {
 264           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 265           insert_bb (new_occ, dom, &occ->children);
 266           return;
 267         }
 268
 269       else if (dom != idom)
 270         {
 271           gcc_assert (!dom->aux);
 272
 273           /* There is a dominator between IDOM and BB, add it and make
 274              two children out of NEW_OCC and OCC.  First, remove OCC from
 275              its list.  */
 276           *p_occ = occ->next;
 277           new_occ->next = occ;
 278           occ->next = NULL;
 279
 280           /* None of the previous blocks has DOM as a dominator: if we tail
 281              recursed, we would reexamine them uselessly. Just switch BB with
 282              DOM, and go on looking for blocks dominated by DOM.  */
 283           new_occ = occ_new (dom, new_occ);
 284         }
 285
 286       else
 287         {
 288           /* Nothing special, go on with the next element.  */
 289           p_occ = &occ->next;
 290         }
 291     }
 292
 293   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 294   new_occ->next = *p_head;
 295   *p_head = new_occ;
 296 }
 297
 298 /* Register that we found a division in BB.  */
 299
 300 static inline void
 301 register_division_in (basic_block bb)
 302 {
 303   struct occurrence *occ;
 304
 305   occ = (struct occurrence *) bb->aux;
 306   if (!occ)
 307     {
 308       occ = occ_new (bb, NULL);
 309       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 310     }
 311
 312   occ->bb_has_division = true;
 313   occ->num_divisions++;
 314 }
 315
 316
 317 /* Compute the number of divisions that postdominate each block in OCC and
 318    its children.  */
 319
 320 static void
 321 compute_merit (struct occurrence *occ)
 322 {
 323   struct occurrence *occ_child;
 324   basic_block dom = occ->bb;
 325
 326   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 327     {
 328       basic_block bb;
 329       if (occ_child->children)
 330         compute_merit (occ_child);
 331
 332       if (flag_exceptions)
 333         bb = single_noncomplex_succ (dom);
 334       else
 335         bb = dom;
 336
 337       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 338         occ->num_divisions += occ_child->num_divisions;
 339     }
 340 }
 341
 342
 343 /* Return whether USE_STMT is a floating-point division by DEF.  */
 344 static inline bool
 345 is_division_by (gimple use_stmt, tree def)
 346 {
 347   return is_gimple_assign (use_stmt)
 348          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 349          && gimple_assign_rhs2 (use_stmt) == def
 350          /* Do not recognize x / x as valid division, as we are getting
 351             confused later by replacing all immediate uses x in such
 352             a stmt.  */
 353          && gimple_assign_rhs1 (use_stmt) != def;
 354 }
 355
 356 /* Walk the subset of the dominator tree rooted at OCC, setting the
 357    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 358    the given basic block.  The field may be left NULL, of course,
 359    if it is not possible or profitable to do the optimization.
 360
 361    DEF_BSI is an iterator pointing at the statement defining DEF.
 362    If RECIP_DEF is set, a dominator already has a computation that can
 363    be used.  */
 364
 365 static void
 366 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 367                     tree def, tree recip_def, int threshold)
 368 {
 369   tree type;
 370   gassign *new_stmt;
 371   gimple_stmt_iterator gsi;
 372   struct occurrence *occ_child;
 373
 374   if (!recip_def
 375       && (occ->bb_has_division || !flag_trapping_math)
 376       && occ->num_divisions >= threshold)
 377     {
 378       /* Make a variable with the replacement and substitute it.  */
 379       type = TREE_TYPE (def);
 380       recip_def = create_tmp_reg (type, "reciptmp");
 381       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 382                                       build_one_cst (type), def);
 383
 384       if (occ->bb_has_division)
 385         {
 386           /* Case 1: insert before an existing division.  */
 387           gsi = gsi_after_labels (occ->bb);
 388           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 389             gsi_next (&gsi);
 390
 391           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 392         }
 393       else if (def_gsi && occ->bb == def_gsi->bb)
 394         {
 395           /* Case 2: insert right after the definition.  Note that this will
 396              never happen if the definition statement can throw, because in
 397              that case the sole successor of the statement's basic block will
 398              dominate all the uses as well.  */
 399           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 400         }
 401       else
 402         {
 403           /* Case 3: insert in a basic block not containing defs/uses.  */
 404           gsi = gsi_after_labels (occ->bb);
 405           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 406         }
 407
 408       reciprocal_stats.rdivs_inserted++;
 409
 410       occ->recip_def_stmt = new_stmt;
 411     }
 412
 413   occ->recip_def = recip_def;
 414   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 415     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 416 }
 417
 418
 419 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 420    possible.  */
 421
 422 static inline void
 423 replace_reciprocal (use_operand_p use_p)
 424 {
 425   gimple use_stmt = USE_STMT (use_p);
 426   basic_block bb = gimple_bb (use_stmt);
 427   struct occurrence *occ = (struct occurrence *) bb->aux;
 428
 429   if (optimize_bb_for_speed_p (bb)
 430       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 431     {
 432       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 433       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 434       SET_USE (use_p, occ->recip_def);
 435       fold_stmt_inplace (&gsi);
 436       update_stmt (use_stmt);
 437     }
 438 }
 439
 440
 441 /* Free OCC and return one more "struct occurrence" to be freed.  */
 442
 443 static struct occurrence *
 444 free_bb (struct occurrence *occ)
 445 {
 446   struct occurrence *child, *next;
 447
 448   /* First get the two pointers hanging off OCC.  */
 449   next = occ->next;
 450   child = occ->children;
 451   occ->bb->aux = NULL;
 452   pool_free (occ_pool, occ);
 453
 454   /* Now ensure that we don't recurse unless it is necessary.  */
 455   if (!child)
 456     return next;
 457   else
 458     {
 459       while (next)
 460         next = free_bb (next);
 461
 462       return child;
 463     }
 464 }
 465
 466
 467 /* Look for floating-point divisions among DEF's uses, and try to
 468    replace them by multiplications with the reciprocal.  Add
 469    as many statements computing the reciprocal as needed.
 470
 471    DEF must be a GIMPLE register of a floating-point type.  */
 472
 473 static void
 474 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 475 {
 476   use_operand_p use_p;
 477   imm_use_iterator use_iter;
 478   struct occurrence *occ;
 479   int count = 0, threshold;
 480
 481   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 482
 483   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 484     {
 485       gimple use_stmt = USE_STMT (use_p);
 486       if (is_division_by (use_stmt, def))
 487         {
 488           register_division_in (gimple_bb (use_stmt));
 489           count++;
 490         }
 491     }
 492
 493   /* Do the expensive part only if we can hope to optimize something.  */
 494   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 495   if (count >= threshold)
 496     {
 497       gimple use_stmt;
 498       for (occ = occ_head; occ; occ = occ->next)
 499         {
 500           compute_merit (occ);
 501           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 502         }
 503
 504       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 505         {
 506           if (is_division_by (use_stmt, def))
 507             {
 508               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 509                 replace_reciprocal (use_p);
 510             }
 511         }
 512     }
 513
 514   for (occ = occ_head; occ; )
 515     occ = free_bb (occ);
 516
 517   occ_head = NULL;
 518 }
 519
 520 /* Go through all the floating-point SSA_NAMEs, and call
 521    execute_cse_reciprocals_1 on each of them.  */
 522 namespace {
 523
 524 const pass_data pass_data_cse_reciprocals =
 525 {
 526   GIMPLE_PASS, /* type */
 527   "recip", /* name */
 528   OPTGROUP_NONE, /* optinfo_flags */
 529   TV_NONE, /* tv_id */
 530   PROP_ssa, /* properties_required */
 531   0, /* properties_provided */
 532   0, /* properties_destroyed */
 533   0, /* todo_flags_start */
 534   TODO_update_ssa, /* todo_flags_finish */
 535 };
 536
 537 class pass_cse_reciprocals : public gimple_opt_pass
 538 {
 539 public:
 540   pass_cse_reciprocals (gcc::context *ctxt)
 541     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 542   {}
 543
 544   /* opt_pass methods: */
 545   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 546   virtual unsigned int execute (function *);
 547
 548 }; // class pass_cse_reciprocals
 549
 550 unsigned int
 551 pass_cse_reciprocals::execute (function *fun)
 552 {
 553   basic_block bb;
 554   tree arg;
 555
 556   occ_pool = create_alloc_pool ("dominators for recip",
 557                                 sizeof (struct occurrence),
 558                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 559
 560   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 561   calculate_dominance_info (CDI_DOMINATORS);
 562   calculate_dominance_info (CDI_POST_DOMINATORS);
 563
 564 #ifdef ENABLE_CHECKING
 565   FOR_EACH_BB_FN (bb, fun)
 566     gcc_assert (!bb->aux);
 567 #endif
 568
 569   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 570     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 571         && is_gimple_reg (arg))
 572       {
 573         tree name = ssa_default_def (fun, arg);
 574         if (name)
 575           execute_cse_reciprocals_1 (NULL, name);
 576       }
 577
 578   FOR_EACH_BB_FN (bb, fun)
 579     {
 580       tree def;
 581
 582       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 583            gsi_next (&gsi))
 584         {
 585           gphi *phi = gsi.phi ();
 586           def = PHI_RESULT (phi);
 587           if (! virtual_operand_p (def)
 588               && FLOAT_TYPE_P (TREE_TYPE (def)))
 589             execute_cse_reciprocals_1 (NULL, def);
 590         }
 591
 592       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 593            gsi_next (&gsi))
 594         {
 595           gimple stmt = gsi_stmt (gsi);
 596
 597           if (gimple_has_lhs (stmt)
 598               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 599               && FLOAT_TYPE_P (TREE_TYPE (def))
 600               && TREE_CODE (def) == SSA_NAME)
 601             execute_cse_reciprocals_1 (&gsi, def);
 602         }
 603
 604       if (optimize_bb_for_size_p (bb))
 605         continue;
 606
 607       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 608       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 609            gsi_next (&gsi))
 610         {
 611           gimple stmt = gsi_stmt (gsi);
 612           tree fndecl;
 613
 614           if (is_gimple_assign (stmt)
 615               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 616             {
 617               tree arg1 = gimple_assign_rhs2 (stmt);
 618               gimple stmt1;
 619
 620               if (TREE_CODE (arg1) != SSA_NAME)
 621                 continue;
 622
 623               stmt1 = SSA_NAME_DEF_STMT (arg1);
 624
 625               if (is_gimple_call (stmt1)
 626                   && gimple_call_lhs (stmt1)
 627                   && (fndecl = gimple_call_fndecl (stmt1))
 628                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 629                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 630                 {
 631                   enum built_in_function code;
 632                   bool md_code, fail;
 633                   imm_use_iterator ui;
 634                   use_operand_p use_p;
 635
 636                   code = DECL_FUNCTION_CODE (fndecl);
 637                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 638
 639                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 640                   if (!fndecl)
 641                     continue;
 642
 643                   /* Check that all uses of the SSA name are divisions,
 644                      otherwise replacing the defining statement will do
 645                      the wrong thing.  */
 646                   fail = false;
 647                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 648                     {
 649                       gimple stmt2 = USE_STMT (use_p);
 650                       if (is_gimple_debug (stmt2))
 651                         continue;
 652                       if (!is_gimple_assign (stmt2)
 653                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 654                           || gimple_assign_rhs1 (stmt2) == arg1
 655                           || gimple_assign_rhs2 (stmt2) != arg1)
 656                         {
 657                           fail = true;
 658                           break;
 659                         }
 660                     }
 661                   if (fail)
 662                     continue;
 663
 664                   gimple_replace_ssa_lhs (stmt1, arg1);
 665                   gimple_call_set_fndecl (stmt1, fndecl);
 666                   update_stmt (stmt1);
 667                   reciprocal_stats.rfuncs_inserted++;
 668
 669                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 670                     {
 671                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 672                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 673                       fold_stmt_inplace (&gsi);
 674                       update_stmt (stmt);
 675                     }
 676                 }
 677             }
 678         }
 679     }
 680
 681   statistics_counter_event (fun, "reciprocal divs inserted",
 682                             reciprocal_stats.rdivs_inserted);
 683   statistics_counter_event (fun, "reciprocal functions inserted",
 684                             reciprocal_stats.rfuncs_inserted);
 685
 686   free_dominance_info (CDI_DOMINATORS);
 687   free_dominance_info (CDI_POST_DOMINATORS);
 688   free_alloc_pool (occ_pool);
 689   return 0;
 690 }
 691
 692 } // anon namespace
 693
 694 gimple_opt_pass *
 695 make_pass_cse_reciprocals (gcc::context *ctxt)
 696 {
 697   return new pass_cse_reciprocals (ctxt);
 698 }
 699
 700 /* Records an occurrence at statement USE_STMT in the vector of trees
 701    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 702    is not yet initialized.  Returns true if the occurrence was pushed on
 703    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 704    statements in the vector.  */
 705
 706 static bool
 707 maybe_record_sincos (vec<gimple> *stmts,
 708                      basic_block *top_bb, gimple use_stmt)
 709 {
 710   basic_block use_bb = gimple_bb (use_stmt);
 711   if (*top_bb
 712       && (*top_bb == use_bb
 713           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 714     stmts->safe_push (use_stmt);
 715   else if (!*top_bb
 716            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 717     {
 718       stmts->safe_push (use_stmt);
 719       *top_bb = use_bb;
 720     }
 721   else
 722     return false;
 723
 724   return true;
 725 }
 726
 727 /* Look for sin, cos and cexpi calls with the same argument NAME and
 728    create a single call to cexpi CSEing the result in this case.
 729    We first walk over all immediate uses of the argument collecting
 730    statements that we can CSE in a vector and in a second pass replace
 731    the statement rhs with a REALPART or IMAGPART expression on the
 732    result of the cexpi call we insert before the use statement that
 733    dominates all other candidates.  */
 734
 735 static bool
 736 execute_cse_sincos_1 (tree name)
 737 {
 738   gimple_stmt_iterator gsi;
 739   imm_use_iterator use_iter;
 740   tree fndecl, res, type;
 741   gimple def_stmt, use_stmt, stmt;
 742   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 743   vec<gimple> stmts = vNULL;
 744   basic_block top_bb = NULL;
 745   int i;
 746   bool cfg_changed = false;
 747
 748   type = TREE_TYPE (name);
 749   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 750     {
 751       if (gimple_code (use_stmt) != GIMPLE_CALL
 752           || !gimple_call_lhs (use_stmt)
 753           || !(fndecl = gimple_call_fndecl (use_stmt))
 754           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 755         continue;
 756
 757       switch (DECL_FUNCTION_CODE (fndecl))
 758         {
 759         CASE_FLT_FN (BUILT_IN_COS):
 760           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 761           break;
 762
 763         CASE_FLT_FN (BUILT_IN_SIN):
 764           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 765           break;
 766
 767         CASE_FLT_FN (BUILT_IN_CEXPI):
 768           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 769           break;
 770
 771         default:;
 772         }
 773     }
 774
 775   if (seen_cos + seen_sin + seen_cexpi <= 1)
 776     {
 777       stmts.release ();
 778       return false;
 779     }
 780
 781   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 782      the name def statement.  */
 783   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 784   if (!fndecl)
 785     return false;
 786   stmt = gimple_build_call (fndecl, 1, name);
 787   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 788   gimple_call_set_lhs (stmt, res);
 789
 790   def_stmt = SSA_NAME_DEF_STMT (name);
 791   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 792       && gimple_code (def_stmt) != GIMPLE_PHI
 793       && gimple_bb (def_stmt) == top_bb)
 794     {
 795       gsi = gsi_for_stmt (def_stmt);
 796       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 797     }
 798   else
 799     {
 800       gsi = gsi_after_labels (top_bb);
 801       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 802     }
 803   sincos_stats.inserted++;
 804
 805   /* And adjust the recorded old call sites.  */
 806   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 807     {
 808       tree rhs = NULL;
 809       fndecl = gimple_call_fndecl (use_stmt);
 810
 811       switch (DECL_FUNCTION_CODE (fndecl))
 812         {
 813         CASE_FLT_FN (BUILT_IN_COS):
 814           rhs = fold_build1 (REALPART_EXPR, type, res);
 815           break;
 816
 817         CASE_FLT_FN (BUILT_IN_SIN):
 818           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 819           break;
 820
 821         CASE_FLT_FN (BUILT_IN_CEXPI):
 822           rhs = res;
 823           break;
 824
 825         default:;
 826           gcc_unreachable ();
 827         }
 828
 829         /* Replace call with a copy.  */
 830         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 831
 832         gsi = gsi_for_stmt (use_stmt);
 833         gsi_replace (&gsi, stmt, true);
 834         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 835           cfg_changed = true;
 836     }
 837
 838   stmts.release ();
 839
 840   return cfg_changed;
 841 }
 842
 843 /* To evaluate powi(x,n), the floating point value x raised to the
 844    constant integer exponent n, we use a hybrid algorithm that
 845    combines the "window method" with look-up tables.  For an
 846    introduction to exponentiation algorithms and "addition chains",
 847    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 848    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 849    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 850    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 851
 852 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 853    multiplications to inline before calling the system library's pow
 854    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 855    so this default never requires calling pow, powf or powl.  */
 856
 857 #ifndef POWI_MAX_MULTS
 858 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 859 #endif
 860
 861 /* The size of the "optimal power tree" lookup table.  All
 862    exponents less than this value are simply looked up in the
 863    powi_table below.  This threshold is also used to size the
 864    cache of pseudo registers that hold intermediate results.  */
 865 #define POWI_TABLE_SIZE 256
 866
 867 /* The size, in bits of the window, used in the "window method"
 868    exponentiation algorithm.  This is equivalent to a radix of
 869    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 870 #define POWI_WINDOW_SIZE 3
 871
 872 /* The following table is an efficient representation of an
 873    "optimal power tree".  For each value, i, the corresponding
  874    value, j, in the table states that an optimal evaluation
 875    sequence for calculating pow(x,i) can be found by evaluating
 876    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 877    100 integers is given in Knuth's "Seminumerical algorithms".  */
 878
 879 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 880   {
 881       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 882       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 883       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 884      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 885      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 886      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 887      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 888      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 889      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 890      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 891      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 892      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 893      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 894      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 895      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 896      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 897      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 898      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 899      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 900      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 901      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 902      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 903      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 904      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 905      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 906     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 907     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 908     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 909     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 910     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 911     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 912     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 913   };
 914
 915
 916 /* Return the number of multiplications required to calculate
 917    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 918    subroutine of powi_cost.  CACHE is an array indicating
 919    which exponents have already been calculated.  */
 920
 921 static int
 922 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 923 {
 924   /* If we've already calculated this exponent, then this evaluation
 925      doesn't require any additional multiplications.  */
 926   if (cache[n])
 927     return 0;
 928
 929   cache[n] = true;
 930   return powi_lookup_cost (n - powi_table[n], cache)
 931          + powi_lookup_cost (powi_table[n], cache) + 1;
 932 }
 933
 934 /* Return the number of multiplications required to calculate
 935    powi(x,n) for an arbitrary x, given the exponent N.  This
 936    function needs to be kept in sync with powi_as_mults below.  */
 937
 938 static int
 939 powi_cost (HOST_WIDE_INT n)
 940 {
 941   bool cache[POWI_TABLE_SIZE];
 942   unsigned HOST_WIDE_INT digit;
 943   unsigned HOST_WIDE_INT val;
 944   int result;
 945
 946   if (n == 0)
 947     return 0;
 948
 949   /* Ignore the reciprocal when calculating the cost.  */
 950   val = (n < 0) ? -n : n;
 951
 952   /* Initialize the exponent cache.  */
 953   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 954   cache[1] = true;
 955
 956   result = 0;
 957
 958   while (val >= POWI_TABLE_SIZE)
 959     {
 960       if (val & 1)
 961         {
 962           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 963           result += powi_lookup_cost (digit, cache)
 964                     + POWI_WINDOW_SIZE + 1;
 965           val >>= POWI_WINDOW_SIZE;
 966         }
 967       else
 968         {
 969           val >>= 1;
 970           result++;
 971         }
 972     }
 973
 974   return result + powi_lookup_cost (val, cache);
 975 }
 976
 977 /* Recursive subroutine of powi_as_mults.  This function takes the
 978    array, CACHE, of already calculated exponents and an exponent N and
 979    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 980
 981 static tree
 982 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 983                  HOST_WIDE_INT n, tree *cache)
 984 {
 985   tree op0, op1, ssa_target;
 986   unsigned HOST_WIDE_INT digit;
 987   gassign *mult_stmt;
 988
 989   if (n < POWI_TABLE_SIZE && cache[n])
 990     return cache[n];
 991
 992   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 993
 994   if (n < POWI_TABLE_SIZE)
 995     {
 996       cache[n] = ssa_target;
 997       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 998       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 999     }
1000   else if (n & 1)
1001     {
1002       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1003       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1004       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1005     }
1006   else
1007     {
1008       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1009       op1 = op0;
1010     }
1011
1012   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1013   gimple_set_location (mult_stmt, loc);
1014   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1015
1016   return ssa_target;
1017 }
1018
1019 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1020    This function needs to be kept in sync with powi_cost above.  */
1021
1022 static tree
1023 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1024                tree arg0, HOST_WIDE_INT n)
1025 {
1026   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1027   gassign *div_stmt;
1028   tree target;
1029
1030   if (n == 0)
1031     return build_real (type, dconst1);
1032
1033   memset (cache, 0,  sizeof (cache));
1034   cache[1] = arg0;
1035
1036   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1037   if (n >= 0)
1038     return result;
1039
1040   /* If the original exponent was negative, reciprocate the result.  */
1041   target = make_temp_ssa_name (type, NULL, "powmult");
1042   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1043                                   build_real (type, dconst1), result);
1044   gimple_set_location (div_stmt, loc);
1045   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1046
1047   return target;
1048 }
1049
1050 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1051    location info LOC.  If the arguments are appropriate, create an
1052    equivalent sequence of statements prior to GSI using an optimal
1053    number of multiplications, and return an expession holding the
1054    result.  */
1055
1056 static tree
1057 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1058                             tree arg0, HOST_WIDE_INT n)
1059 {
1060   /* Avoid largest negative number.  */
1061   if (n != -n
1062       && ((n >= -1 && n <= 2)
1063           || (optimize_function_for_speed_p (cfun)
1064               && powi_cost (n) <= POWI_MAX_MULTS)))
1065     return powi_as_mults (gsi, loc, arg0, n);
1066
1067   return NULL_TREE;
1068 }
1069
1070 /* Build a gimple call statement that calls FN with argument ARG.
1071    Set the lhs of the call statement to a fresh SSA name.  Insert the
1072    statement prior to GSI's current position, and return the fresh
1073    SSA name.  */
1074
1075 static tree
1076 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1077                        tree fn, tree arg)
1078 {
1079   gcall *call_stmt;
1080   tree ssa_target;
1081
1082   call_stmt = gimple_build_call (fn, 1, arg);
1083   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1084   gimple_set_lhs (call_stmt, ssa_target);
1085   gimple_set_location (call_stmt, loc);
1086   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1087
1088   return ssa_target;
1089 }
1090
1091 /* Build a gimple binary operation with the given CODE and arguments
1092    ARG0, ARG1, assigning the result to a new SSA name for variable
1093    TARGET.  Insert the statement prior to GSI's current position, and
1094    return the fresh SSA name.*/
1095
1096 static tree
1097 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1098                         const char *name, enum tree_code code,
1099                         tree arg0, tree arg1)
1100 {
1101   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1102   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1103   gimple_set_location (stmt, loc);
1104   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1105   return result;
1106 }
1107
1108 /* Build a gimple reference operation with the given CODE and argument
1109    ARG, assigning the result to a new SSA name of TYPE with NAME.
1110    Insert the statement prior to GSI's current position, and return
1111    the fresh SSA name.  */
1112
1113 static inline tree
1114 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1115                       const char *name, enum tree_code code, tree arg0)
1116 {
1117   tree result = make_temp_ssa_name (type, NULL, name);
1118   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1119   gimple_set_location (stmt, loc);
1120   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1121   return result;
1122 }
1123
1124 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1125    prior to GSI's current position, and return the fresh SSA name.  */
1126
1127 static tree
1128 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1129                        tree type, tree val)
1130 {
1131   tree result = make_ssa_name (type);
1132   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1133   gimple_set_location (stmt, loc);
1134   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1135   return result;
1136 }
1137
/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  Return NULL_TREE when the exponent
   is not a constant or when none of the rewrites below applies.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
  machine_mode mode;
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  The exponent is
     converted to an integer and back; it is integral exactly when the
     round trip reproduces it.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c_is_int = real_identical (&c, &cint);

  if (c_is_int
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && optimize_bb_for_speed_p (gsi_bb (*gsi))
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && REAL_VALUES_EQUAL (c, dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
     a builtin sqrt instruction is smaller than a call to pow with 0.25,
     so do this optimization even if -Os.  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 0.25 is made
     from 1.0 by lowering its binary exponent by two.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && REAL_VALUES_EQUAL (c, dconst1_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
     optimizing for space.  Don't do this optimization if we don't have
     a hardware sqrt insn.  The constant 0.75 is made from 3.0 by
     lowering its binary exponent by two.  */
  real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
  SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && optimize_function_for_speed_p (cfun)
      && REAL_VALUES_EQUAL (c, dconst3_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);

      /* sqrt(x) * sqrt(sqrt(x))  */
      return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                     sqrt_arg0, sqrt_sqrt);
    }

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && REAL_VALUES_EQUAL (c, dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 1./6. is made
     from the truncated 1./3. by lowering its binary exponent by one.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && optimize_function_for_speed_p (cfun)
      && hw_sqrt_exists
      && REAL_VALUES_EQUAL (c, dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,c), where n = 2c for some nonzero integer n
     and c not an integer, into

       sqrt(x) * powi(x, n/2),                n > 0;
       1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.

     Do not calculate the powi factor when n/2 = 0.  From here on N
     holds 2c rather than c.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c2_is_int = real_identical (&c2, &cint);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && c2_is_int
      && !c_is_int
      && optimize_function_for_speed_p (cfun))
    {
      tree powi_x_ndiv2 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         n is 1 or -1, where the result is always 1.  */
      if (absu_hwi (n) != 1)
        {
          powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 2));
          if (!powi_x_ndiv2)
            return NULL_TREE;
        }

      /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
         result of the optimal multiply sequence just calculated.  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      if (absu_hwi (n) == 1)
        result = sqrt_arg0;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         sqrt_arg0, powi_x_ndiv2);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);
      return result;
    }

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.  N is set to 3c rounded to an
     integer, and C2 to n/3 converted to MODE, so that the test
     real_identical (&c2, &c) below accepts only exponents that are the
     representation in MODE of some n/3.  */
  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
1379
1380 /* ARG is the argument to a cabs builtin call in GSI with location info
1381    LOC.  Create a sequence of statements prior to GSI that calculates
1382    sqrt(R*R + I*I), where R and I are the real and imaginary components
1383    of ARG, respectively.  Return an expression holding the result.  */
1384
1385 static tree
1386 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1387 {
1388   tree real_part, imag_part, addend1, addend2, sum, result;
1389   tree type = TREE_TYPE (TREE_TYPE (arg));
1390   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1391   machine_mode mode = TYPE_MODE (type);
1392
1393   if (!flag_unsafe_math_optimizations
1394       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1395       || !sqrtfn
1396       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1397     return NULL_TREE;
1398
1399   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1400                                     REALPART_EXPR, arg);
1401   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1402                                     real_part, real_part);
1403   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1404                                     IMAGPART_EXPR, arg);
1405   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1406                                     imag_part, imag_part);
1407   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1408   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1409
1410   return result;
1411 }
1412
1413 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1414    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1415    an optimal number of multiplies, when n is a constant.  */
1416
1417 namespace {
1418
1419 const pass_data pass_data_cse_sincos =
1420 {
1421   GIMPLE_PASS, /* type */
1422   "sincos", /* name */
1423   OPTGROUP_NONE, /* optinfo_flags */
1424   TV_NONE, /* tv_id */
1425   PROP_ssa, /* properties_required */
1426   0, /* properties_provided */
1427   0, /* properties_destroyed */
1428   0, /* todo_flags_start */
1429   TODO_update_ssa, /* todo_flags_finish */
1430 };
1431
1432 class pass_cse_sincos : public gimple_opt_pass
1433 {
1434 public:
1435   pass_cse_sincos (gcc::context *ctxt)
1436     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1437   {}
1438
1439   /* opt_pass methods: */
1440   virtual bool gate (function *)
1441     {
1442       /* We no longer require either sincos or cexp, since powi expansion
1443          piggybacks on this pass.  */
1444       return optimize;
1445     }
1446
1447   virtual unsigned int execute (function *);
1448
1449 }; // class pass_cse_sincos
1450
1451 unsigned int
1452 pass_cse_sincos::execute (function *fun)
1453 {
1454   basic_block bb;
1455   bool cfg_changed = false;
1456
1457   calculate_dominance_info (CDI_DOMINATORS);
1458   memset (&sincos_stats, 0, sizeof (sincos_stats));
1459
1460   FOR_EACH_BB_FN (bb, fun)
1461     {
1462       gimple_stmt_iterator gsi;
1463       bool cleanup_eh = false;
1464
1465       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1466         {
1467           gimple stmt = gsi_stmt (gsi);
1468           tree fndecl;
1469
1470           /* Only the last stmt in a bb could throw, no need to call
1471              gimple_purge_dead_eh_edges if we change something in the middle
1472              of a basic block.  */
1473           cleanup_eh = false;
1474
1475           if (is_gimple_call (stmt)
1476               && gimple_call_lhs (stmt)
1477               && (fndecl = gimple_call_fndecl (stmt))
1478               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1479             {
1480               tree arg, arg0, arg1, result;
1481               HOST_WIDE_INT n;
1482               location_t loc;
1483
1484               switch (DECL_FUNCTION_CODE (fndecl))
1485                 {
1486                 CASE_FLT_FN (BUILT_IN_COS):
1487                 CASE_FLT_FN (BUILT_IN_SIN):
1488                 CASE_FLT_FN (BUILT_IN_CEXPI):
1489                   /* Make sure we have either sincos or cexp.  */
1490                   if (!targetm.libc_has_function (function_c99_math_complex)
1491                       && !targetm.libc_has_function (function_sincos))
1492                     break;
1493
1494                   arg = gimple_call_arg (stmt, 0);
1495                   if (TREE_CODE (arg) == SSA_NAME)
1496                     cfg_changed |= execute_cse_sincos_1 (arg);
1497                   break;
1498
1499                 CASE_FLT_FN (BUILT_IN_POW):
1500                   arg0 = gimple_call_arg (stmt, 0);
1501                   arg1 = gimple_call_arg (stmt, 1);
1502
1503                   loc = gimple_location (stmt);
1504                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1505
1506                   if (result)
1507                     {
1508                       tree lhs = gimple_get_lhs (stmt);
1509                       gassign *new_stmt = gimple_build_assign (lhs, result);
1510                       gimple_set_location (new_stmt, loc);
1511                       unlink_stmt_vdef (stmt);
1512                       gsi_replace (&gsi, new_stmt, true);
1513                       cleanup_eh = true;
1514                       if (gimple_vdef (stmt))
1515                         release_ssa_name (gimple_vdef (stmt));
1516                     }
1517                   break;
1518
1519                 CASE_FLT_FN (BUILT_IN_POWI):
1520                   arg0 = gimple_call_arg (stmt, 0);
1521                   arg1 = gimple_call_arg (stmt, 1);
1522                   loc = gimple_location (stmt);
1523
1524                   if (real_minus_onep (arg0))
1525                     {
1526                       tree t0, t1, cond, one, minus_one;
1527                       gassign *stmt;
1528
1529                       t0 = TREE_TYPE (arg0);
1530                       t1 = TREE_TYPE (arg1);
1531                       one = build_real (t0, dconst1);
1532                       minus_one = build_real (t0, dconstm1);
1533
1534                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1535                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1536                                                   arg1, build_int_cst (t1, 1));
1537                       gimple_set_location (stmt, loc);
1538                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1539
1540                       result = make_temp_ssa_name (t0, NULL, "powi");
1541                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1542                                                   minus_one, one);
1543                       gimple_set_location (stmt, loc);
1544                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1545                     }
1546                   else
1547                     {
1548                       if (!tree_fits_shwi_p (arg1))
1549                         break;
1550
1551                       n = tree_to_shwi (arg1);
1552                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1553                     }
1554
1555                   if (result)
1556                     {
1557                       tree lhs = gimple_get_lhs (stmt);
1558                       gassign *new_stmt = gimple_build_assign (lhs, result);
1559                       gimple_set_location (new_stmt, loc);
1560                       unlink_stmt_vdef (stmt);
1561                       gsi_replace (&gsi, new_stmt, true);
1562                       cleanup_eh = true;
1563                       if (gimple_vdef (stmt))
1564                         release_ssa_name (gimple_vdef (stmt));
1565                     }
1566                   break;
1567
1568                 CASE_FLT_FN (BUILT_IN_CABS):
1569                   arg0 = gimple_call_arg (stmt, 0);
1570                   loc = gimple_location (stmt);
1571                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1572
1573                   if (result)
1574                     {
1575                       tree lhs = gimple_get_lhs (stmt);
1576                       gassign *new_stmt = gimple_build_assign (lhs, result);
1577                       gimple_set_location (new_stmt, loc);
1578                       unlink_stmt_vdef (stmt);
1579                       gsi_replace (&gsi, new_stmt, true);
1580                       cleanup_eh = true;
1581                       if (gimple_vdef (stmt))
1582                         release_ssa_name (gimple_vdef (stmt));
1583                     }
1584                   break;
1585
1586                 default:;
1587                 }
1588             }
1589         }
1590       if (cleanup_eh)
1591         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1592     }
1593
1594   statistics_counter_event (fun, "sincos statements inserted",
1595                             sincos_stats.inserted);
1596
1597   free_dominance_info (CDI_DOMINATORS);
1598   return cfg_changed ? TODO_cleanup_cfg : 0;
1599 }
1600
1601 } // anon namespace
1602
1603 gimple_opt_pass *
1604 make_pass_cse_sincos (gcc::context *ctxt)
1605 {
1606   return new pass_cse_sincos (ctxt);
1607 }
1608
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of octet sized markers:

   0    - target byte has the value 0
   FF   - target byte has an unknown value (eg. due to sign extension)
   1..size - marker value is the index of the byte plus one, index 0
	     being the least significant byte (see CMPNOP below, whose
	     lowest marker is 1).

   To detect permutations on memory sources (arrays and structures), a symbolic
   number is also associated with a base address (the array or structure the
   load is made from), an offset from the base address and a range which gives
   the difference between the highest and lowest accessed memory location to
   make such a symbolic number.  The range is thus different from size which
   reflects the size of the type of current expression.  Note that for non
   memory source, range holds the same value as size.

   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
   still have a size of 2 but this time a range of 1.  */

struct symbolic_number {
  /* The markers, one per byte, least significant byte first.  */
  uint64_t n;
  /* Type of the current expression; its precision gives the size and
     its signedness decides how right shifts are modelled.  */
  tree type;
  /* For a memory source, the array or structure the load is made
     from.  */
  tree base_addr;
  /* For a memory source, the offset from BASE_ADDR.  */
  tree offset;
  /* NOTE(review): presumably the constant byte position of the access
     relative to BASE_ADDR -- confirm against the code filling it.  */
  HOST_WIDE_INT bytepos;
  /* NOTE(review): presumably alias information of the memory source --
     confirm against the code filling it.  */
  tree alias_set;
  /* NOTE(review): presumably the virtual use of the load statement --
     confirm against the code filling it.  */
  tree vuse;
  /* Difference between the highest and lowest accessed memory
     location; equal to the size for a non memory source.  */
  unsigned HOST_WIDE_INT range;
};
1639
/* Number of bits each marker occupies in a symbolic number.  */
#define BITS_PER_MARKER 8
/* Mask selecting the lowest marker of a symbolic number.  */
#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
/* Marker standing for a byte whose value is unknown (all marker bits
   set).  */
#define MARKER_BYTE_UNKNOWN MARKER_MASK
/* Select, in place, the marker of the most significant byte of the
   symbolic number N made of SIZE bytes.  The result is a 64-bit value
   that is nonzero iff that marker is nonzero; it is not shifted down
   to the low bits.  */
#define HEAD_MARKER(n, size) \
  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a nop.  The number is masked according to the size of
   the symbolic number before using it.  */
#define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
  (uint64_t)0x08070605 << 32 | 0x04030201)

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a byte swap.  The number is masked according to the
   size of the symbolic number before using it.  */
#define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
  (uint64_t)0x01020304 << 32 | 0x05060708)
1657
1658 /* Perform the shift or rotate CODE by COUNT bits on symbolic number N,
1659    in place.  Return false if CODE is not a shift or rotate, or if COUNT is
1660    not a positive multiple of BITS_PER_UNIT below the precision of N->type.  */
1661
1662 static inline bool
1663 do_shift_rotate (enum tree_code code,
1664                  struct symbolic_number *n,
1665                  int count)
1666 {
1667   int i, prec = TYPE_PRECISION (n->type), size = prec / BITS_PER_UNIT;
1668   uint64_t head_marker;  /* The head marker of an 8-byte N is in bits 56-63.  */
1669
1670   if (count <= 0 || count >= prec || count % BITS_PER_UNIT != 0)
1671     return false;  /* Such counts would make the host shifts below undefined.  */
1672   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1673
1674   /* Zero out the extra bits of N in order to avoid them being shifted
1675      into the significant bits.  */
1676   if (size < 64 / BITS_PER_MARKER)
1677     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1678
1679   switch (code)
1680     {
1681     case LSHIFT_EXPR:
1682       n->n <<= count;
1683       break;
1684     case RSHIFT_EXPR:
1685       head_marker = HEAD_MARKER (n->n, size);
1686       n->n >>= count;
1687       /* Arithmetic shift of signed type: result is dependent on the value.  */
1688       if (!TYPE_UNSIGNED (n->type) && head_marker)
1689         for (i = 0; i < count / BITS_PER_MARKER; i++)
1690           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1691                   << ((size - 1 - i) * BITS_PER_MARKER);
1692       break;
1693     case LROTATE_EXPR:
1694       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1695       break;
1696     case RROTATE_EXPR:
1697       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1698       break;
1699     default:
1700       return false;
1701     }
1702   /* Zero unused bits for size.  */
1703   if (size < 64 / BITS_PER_MARKER)
1704     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1705   return true;
1706 }
1707
1708 /* Return true if the value computed by STMT is consistent with symbolic
1709    number N: an integer whose precision equals that of N->type.  */
1710
1711 static inline bool
1712 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1713 {
1714   tree result_type = gimple_expr_type (stmt);
1715   bool is_integer, same_precision;
1716
1717   /* Only integer results are accepted.  */
1718   is_integer = TREE_CODE (result_type) == INTEGER_TYPE;
1719   if (!is_integer)
1720     return false;
1721
1722   /* The result must be exactly as wide as the type recorded in N.  */
1723   same_precision = TYPE_PRECISION (result_type) == TYPE_PRECISION (n->type);
1724   return same_precision;
1725 }
1726
1727 /* Reset symbolic number N so that it describes SRC itself, untouched and
1728    without memory source.  Return false if SRC's type cannot be tracked.  */
1729
1730 static bool
1731 init_symbolic_number (struct symbolic_number *n, tree src)
1732 {
1733   int nbits, nbytes;
1734
1735   /* No memory source is known at this point.  */
1736   n->vuse = NULL_TREE;
1737   n->alias_set = NULL_TREE;
1738   n->offset = NULL_TREE;
1739   n->base_addr = NULL_TREE;
1740   n->type = TREE_TYPE (src);
1741
1742   /* Only whole bytes that all fit in the 64-bit marker word are handled.  */
1743   nbits = TYPE_PRECISION (n->type);
1744   nbytes = nbits / BITS_PER_UNIT;
1745   if (nbits % BITS_PER_UNIT != 0 || nbytes > 64 / BITS_PER_MARKER)
1746     return false;
1747
1748   /* Byte I, counting from 1 at the lowest order byte, gets marker I.  */
1749   n->range = nbytes;
1750   n->n = CMPNOP;
1751   if (nbytes < 64 / BITS_PER_MARKER)
1752     n->n &= ((uint64_t) 1 << (nbytes * BITS_PER_MARKER)) - 1;
1753   return true;
1754 }
1755
1756 /* Check if STMT, accessing REF, is a non-volatile load that might feed a byte
1757    swap or a nop.  If so, return true after initializing N from REF and recording
1758    in N the base address, offset, byte position, alias type and vuse of the load.  */
1759
1760 bool
1761 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1762 {
1763   /* Leaf node is an array or component ref. Memorize its base and
1764      offset from base to compare to other such leaf node.  */
1765   HOST_WIDE_INT bitsize, bitpos;
1766   machine_mode mode;
1767   int unsignedp, volatilep;
1768   tree offset, base_addr;
1769
1770   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1771     return false;
1772
1773   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1774                                    &unsignedp, &volatilep, false);
1775
1776   if (TREE_CODE (base_addr) == MEM_REF)  /* Fold the MEM_REF's own offset in.  */
1777     {
1778       offset_int bit_offset = 0;
1779       tree off = TREE_OPERAND (base_addr, 1);
1780
1781       if (!integer_zerop (off))
1782         {
1783           offset_int boff, coff = mem_ref_offset (base_addr);
1784           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);  /* Bytes to bits.  */
1785           bit_offset += boff;
1786         }
1787
1788       base_addr = TREE_OPERAND (base_addr, 0);  /* Continue with operand 0.  */
1789
1790       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1791       if (wi::neg_p (bit_offset))
1792         {
1793           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1794           offset_int tem = bit_offset.and_not (mask);
1795           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1796              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1797           bit_offset -= tem;
1798           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1799           if (offset)
1800             offset = size_binop (PLUS_EXPR, offset,
1801                                     wide_int_to_tree (sizetype, tem));
1802           else
1803             offset = wide_int_to_tree (sizetype, tem);
1804         }
1805
1806       bitpos += bit_offset.to_shwi ();
1807     }
1808
1809   if (bitpos % BITS_PER_UNIT)  /* The access must start on a byte boundary...  */
1810     return false;
1811   if (bitsize % BITS_PER_UNIT)  /* ... and span a whole number of bytes.  */
1812     return false;
1813
1814   if (!init_symbolic_number (n, ref))
1815     return false;
1816   n->base_addr = base_addr;
1817   n->offset = offset;
1818   n->bytepos = bitpos / BITS_PER_UNIT;  /* Bits to bytes.  */
1819   n->alias_set = reference_alias_ptr_type (ref);
1820   n->vuse = gimple_vuse (stmt);
1821   return true;
1822 }
1823
1824 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1825    the operation given by the rhs of STMT on the result.  If the operation
1826    could successfully be executed the function returns a gimple stmt whose
1827    rhs's first tree is the expression of the source operand and NULL
1828    otherwise.  LIMIT bounds the depth of the recursion.  */
1829
1830 static gimple
1831 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1832 {
1833   enum tree_code code;
1834   tree rhs1, rhs2 = NULL;
1835   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1836   enum gimple_rhs_class rhs_class;
1837
1838   if (!limit || !is_gimple_assign (stmt))
1839     return NULL;  /* Recursion budget exhausted or STMT is not an assignment.  */
1840
1841   rhs1 = gimple_assign_rhs1 (stmt);
1842
1843   if (find_bswap_or_nop_load (stmt, rhs1, n))
1844     return stmt;  /* STMT is a load: it is a leaf and N is initialized.  */
1845
1846   if (TREE_CODE (rhs1) != SSA_NAME)
1847     return NULL;
1848
1849   code = gimple_assign_rhs_code (stmt);
1850   rhs_class = gimple_assign_rhs_class (stmt);
1851   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1852
1853   if (rhs_class == GIMPLE_BINARY_RHS)
1854     rhs2 = gimple_assign_rhs2 (stmt);
1855
1856   /* Handle unary rhs and binary rhs with integer constants as second
1857      operand.  */
1858
1859   if (rhs_class == GIMPLE_UNARY_RHS
1860       || (rhs_class == GIMPLE_BINARY_RHS
1861           && TREE_CODE (rhs2) == INTEGER_CST))
1862     {
1863       if (code != BIT_AND_EXPR
1864           && code != LSHIFT_EXPR
1865           && code != RSHIFT_EXPR
1866           && code != LROTATE_EXPR
1867           && code != RROTATE_EXPR
1868           && !CONVERT_EXPR_CODE_P (code))
1869         return NULL;
1870
1871       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1872
1873       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1874          we have to initialize the symbolic number.  */
1875       if (!source_stmt1)
1876         {
1877           if (gimple_assign_load_p (stmt)  /* A load rejected above is no leaf.  */
1878               || !init_symbolic_number (n, rhs1))
1879             return NULL;
1880           source_stmt1 = stmt;
1881         }
1882
1883       switch (code)
1884         {
1885         case BIT_AND_EXPR:
1886           {
1887             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1888             uint64_t val = int_cst_value (rhs2), mask = 0;
1889             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;  /* Selects byte I of VAL.  */
1890
1891             /* Only constants masking full bytes are allowed.  */
1892             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
1893               if ((val & tmp) != 0 && (val & tmp) != tmp)
1894                 return NULL;
1895               else if (val & tmp)
1896                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
1897
1898             n->n &= mask;  /* Keep the markers of the bytes VAL preserves.  */
1899           }
1900           break;
1901         case LSHIFT_EXPR:
1902         case RSHIFT_EXPR:
1903         case LROTATE_EXPR:
1904         case RROTATE_EXPR:
1905           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1906             return NULL;
1907           break;
1908         CASE_CONVERT:
1909           {
1910             int i, type_size, old_type_size;
1911             tree type;
1912
1913             type = gimple_expr_type (stmt);
1914             type_size = TYPE_PRECISION (type);
1915             if (type_size % BITS_PER_UNIT != 0)
1916               return NULL;
1917             type_size /= BITS_PER_UNIT;
1918             if (type_size > 64 / BITS_PER_MARKER)
1919               return NULL;
1920
1921             /* Sign extension: result is dependent on the value.  */
1922             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1923             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
1924                 && HEAD_MARKER (n->n, old_type_size))
1925               for (i = 0; i < type_size - old_type_size; i++)
1926                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1927                         << ((type_size - 1 - i) * BITS_PER_MARKER);
1928
1929             if (type_size < 64 / BITS_PER_MARKER)
1930               {
1931                 /* If STMT casts to a smaller type mask out the bits not
1932                    belonging to the target type.  */
1933                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
1934               }
1935             n->type = type;
1936             if (!n->base_addr)  /* Without memory source range follows the type.  */
1937               n->range = type_size;
1938           }
1939           break;
1940         default:
1941           return NULL;
1942         };
1943       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
1944     }
1945
1946   /* Handle binary rhs.  */
1947
1948   if (rhs_class == GIMPLE_BINARY_RHS)
1949     {
1950       int i, size;
1951       struct symbolic_number n1, n2;
1952       uint64_t mask;
1953       gimple source_stmt2;
1954
1955       if (code != BIT_IOR_EXPR)
1956         return NULL;
1957
1958       if (TREE_CODE (rhs2) != SSA_NAME)
1959         return NULL;
1960
1961       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1962
1963       switch (code)
1964         {
1965         case BIT_IOR_EXPR:
1966           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1967
1968           if (!source_stmt1)
1969             return NULL;
1970
1971           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1972
1973           if (!source_stmt2)
1974             return NULL;
1975
1976           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1977             return NULL;
1978
1979           if (!n1.vuse != !n2.vuse ||
1980           (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1981             return NULL;  /* Both operands must have the same vuse, or none.  */
1982
1983           if (gimple_assign_rhs1 (source_stmt1)
1984               != gimple_assign_rhs1 (source_stmt2))
1985             {
1986               int64_t inc;
1987               HOST_WIDE_INT off_sub;
1988               struct symbolic_number *n_ptr;
1989
1990               if (!n1.base_addr || !n2.base_addr
1991                   || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1992                 return NULL;
1993               if (!n1.offset != !n2.offset ||
1994                   (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1995                 return NULL;
1996
1997               /* We swap n1 with n2 to have n1 < n2.  */
1998               if (n2.bytepos < n1.bytepos)
1999                 {
2000                   struct symbolic_number tmpn;
2001
2002                   tmpn = n2;
2003                   n2 = n1;
2004                   n1 = tmpn;
2005                   source_stmt1 = source_stmt2;  /* Return the stmt of the new n1.  */
2006                 }
2007
2008               off_sub = n2.bytepos - n1.bytepos;
2009
2010               /* Check that the range of memory covered can be represented by
2011                  a symbolic number.  */
2012               if (off_sub + n2.range > 64 / BITS_PER_MARKER)
2013                 return NULL;
2014               n->range = n2.range + off_sub;
2015
2016               /* Reinterpret byte marks in symbolic number holding the value of
2017                  bigger weight according to target endianness.  */
2018               inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
2019               size = TYPE_PRECISION (n1.type) / BITS_PER_UNIT;
2020               if (BYTES_BIG_ENDIAN)
2021                 n_ptr = &n1;
2022               else
2023                 n_ptr = &n2;
2024               for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2025                 {
2026                   unsigned marker =
2027                     (n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2028                   if (marker && marker != MARKER_BYTE_UNKNOWN)
2029                     n_ptr->n += inc;  /* Empty and unknown markers are kept.  */
2030                 }
2031             }
2032           else
2033             n->range = n1.range;
2034
2035           if (!n1.alias_set
2036               || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
2037             n->alias_set = n1.alias_set;
2038           else
2039             n->alias_set = ptr_type_node;  /* Incompatible alias pointer types.  */
2040           n->vuse = n1.vuse;
2041           n->base_addr = n1.base_addr;
2042           n->offset = n1.offset;
2043           n->bytepos = n1.bytepos;
2044           n->type = n1.type;
2045           size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2046           for (i = 0, mask = MARKER_MASK; i < size;
2047                i++, mask <<= BITS_PER_MARKER)
2048             {
2049               uint64_t masked1, masked2;
2050
2051               masked1 = n1.n & mask;
2052               masked2 = n2.n & mask;
2053               if (masked1 && masked2 && masked1 != masked2)
2054                 return NULL;  /* Both operands set this byte, but differently.  */
2055             }
2056           n->n = n1.n | n2.n;
2057
2058           if (!verify_symbolic_number_p (n, stmt))
2059             return NULL;
2060
2061           break;
2062         default:
2063           return NULL;
2064         }
2065       return source_stmt1;
2066     }
2067   return NULL;
2068 }
2069
2070 /* Check if STMT completes a bswap implementation or a read in a given
2071    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2072    accordingly.  It also sets N to represent the kind of operations
2073    performed: size of the resulting expression and whether it works on
2074    a memory source, and if so alias-set and vuse.  On success N->range
2075    is expressed in bits and the function returns a stmt whose rhs's
2076    first tree is the source expression; otherwise it returns NULL.  */
2077
2078 static gimple
2079 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2080 {
2081 /* The numbers which the find_bswap_or_nop_1 result should match in order
2082    to have a full byte swap or a nop.  Below, the former is shifted to the
2083    right and the latter masked according to the range of the symbolic number.  */
2084   uint64_t cmpxchg = CMPXCHG;
2085   uint64_t cmpnop = CMPNOP;
2086
2087   gimple source_stmt;
2088   int limit;
2089
2090   /* The last parameter determines the depth search limit.  It usually
2091      correlates directly to the number n of bytes to be touched.  We
2092      increase that number by log2(n) + 1 here in order to also
2093      cover signed -> unsigned conversions of the src operand as can be seen
2094      in libgcc, and for initial shift/and operation of the src operand.  */
2095   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2096   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2097   source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
2098
2099   if (!source_stmt)
2100     return NULL;
2101
2102   /* Find real size of result (highest non zero byte).  */
2103   if (n->base_addr)
2104     {
2105       int rsize;
2106       uint64_t tmpn;
2107
2108       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);  /* Count markers.  */
2109       n->range = rsize;
2110     }
2111
2112   /* Zero out the extra bits of N and CMP*.  */
2113   if (n->range < (int) sizeof (int64_t))
2114     {
2115       uint64_t mask;
2116
2117       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2118       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2119       cmpnop &= mask;
2120     }
2121
2122   /* A complete byte swap should make the symbolic number to start with
2123      the largest digit in the highest order byte. Unchanged symbolic
2124      number indicates a read with same endianness as target architecture.  */
2125   if (n->n == cmpnop)
2126     *bswap = false;
2127   else if (n->n == cmpxchg)
2128     *bswap = true;
2129   else
2130     return NULL;
2131
2132   /* Useless bit manipulation performed by code.  */
2133   if (!n->base_addr && n->n == cmpnop)
2134     return NULL;
2135
2136   n->range *= BITS_PER_UNIT;  /* From now on the range is in bits.  */
2137   return source_stmt;
2138 }
2139
2140 namespace {
2141 /* Descriptor of the "bswap" GIMPLE pass; it only requires SSA form.  */
2142 const pass_data pass_data_optimize_bswap =
2143 {
2144   GIMPLE_PASS, /* type */
2145   "bswap", /* name */
2146   OPTGROUP_NONE, /* optinfo_flags */
2147   TV_NONE, /* tv_id */
2148   PROP_ssa, /* properties_required */
2149   0, /* properties_provided */
2150   0, /* properties_destroyed */
2151   0, /* todo_flags_start */
2152   0, /* todo_flags_finish */
2153 };
2154 /* The bswap pass object, built on pass_data_optimize_bswap.  */
2155 class pass_optimize_bswap : public gimple_opt_pass
2156 {
2157 public:
2158   pass_optimize_bswap (gcc::context *ctxt)
2159     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2160   {}
2161
2162   /* opt_pass methods: */
2163   virtual bool gate (function *)
2164     {
2165       return flag_expensive_optimizations && optimize;  /* Both must be set.  */
2166     }
2167
2168   virtual unsigned int execute (function *);
2169
2170 }; // class pass_optimize_bswap
2171
2172 /* Perform the bswap optimization: replace the expression computed in the rhs
2173    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2174    Which of these alternatives replaces the rhs is given by N->base_addr (non
2175    null if a load is needed) and BSWAP.  LOAD_TYPE is the type of the load to
2176    perform, whose VUSE and alias set are given in N, while FNDECL is the bswap
2177    builtin to invoke and BSWAP_TYPE the type it operates on.  If a load is
2178    involved, SRC_STMT refers to one of the load statements used to construct
2179    the rhs of CUR_STMT.  N->range gives the size in bits of the rhs expression
2180    and selects the statistics counter.  Return false, with nothing changed,
2181    if the load is rejected because of its alignment, true otherwise.  Note
2182    that with a load, CUR_STMT is moved just before SRC_STMT to do the load
2183    with the same VUSE, which can lead to CUR_STMT changing basic block.  */
2184
2185 static bool
2186 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2187                tree load_type, struct symbolic_number *n, bool bswap)
2188 {
2189   gimple_stmt_iterator gsi;
2190   tree src, tmp, tgt;
2191   gimple bswap_stmt;
2192
2193   gsi = gsi_for_stmt (cur_stmt);
2194   src = gimple_assign_rhs1 (src_stmt);
2195   tgt = gimple_assign_lhs (cur_stmt);
2196
2197   /* Need to load the value from memory first.  */
2198   if (n->base_addr)
2199     {
2200       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2201       tree addr_expr, addr_tmp, val_expr, val_tmp;
2202       tree load_offset_ptr, aligned_load_type;
2203       gimple addr_stmt, load_stmt;
2204       unsigned align;
2205
2206       align = get_object_alignment (src);
2207       if (bswap
2208           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2209           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2210         return false;  /* Give up before anything has been modified.  */
2211
2212       /* Move cur_stmt just before one of the loads of the original
2213          expression to ensure it has the same VUSE.  See PR61517 for what
2214          could go wrong.  */
2215       gsi_move_before (&gsi, &gsi_ins);
2216       gsi = gsi_for_stmt (cur_stmt);
2217
2218       /*  Compute address to load from and cast according to the size
2219           of the load.  */
2220       addr_expr = build_fold_addr_expr (unshare_expr (src));
2221       if (is_gimple_min_invariant (addr_expr))
2222         addr_tmp = addr_expr;
2223       else
2224         {
2225           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2226                                          "load_src");
2227           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2228           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2229         }
2230
2231       /* Perform the load.  */
2232       aligned_load_type = load_type;
2233       if (align < TYPE_ALIGN (load_type))
2234         aligned_load_type = build_aligned_type (load_type, align);
2235       load_offset_ptr = build_int_cst (n->alias_set, 0);
2236       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2237                               load_offset_ptr);
2238
2239       if (!bswap)
2240         {
2241           if (n->range == 16)
2242             nop_stats.found_16bit++;
2243           else if (n->range == 32)
2244             nop_stats.found_32bit++;
2245           else
2246             {
2247               gcc_assert (n->range == 64);
2248               nop_stats.found_64bit++;
2249             }
2250
2251           /* Convert the result of load if necessary.  */
2252           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2253             {
2254               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2255                                             "load_dst");
2256               load_stmt = gimple_build_assign (val_tmp, val_expr);
2257               gimple_set_vuse (load_stmt, n->vuse);
2258               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2259               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2260             }
2261           else
2262             {
2263               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2264               gimple_set_vuse (cur_stmt, n->vuse);
2265             }
2266           update_stmt (cur_stmt);
2267
2268           if (dump_file)
2269             {
2270               fprintf (dump_file,
2271                        "%d bit load in target endianness found at: ",
2272                        (int)n->range);
2273               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2274             }
2275           return true;  /* A plain load needs no bswap.  */
2276         }
2277       else
2278         {
2279           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2280           load_stmt = gimple_build_assign (val_tmp, val_expr);
2281           gimple_set_vuse (load_stmt, n->vuse);
2282           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2283         }
2284       src = val_tmp;  /* The bswap below operates on the loaded value.  */
2285     }
2286
2287   if (n->range == 16)
2288     bswap_stats.found_16bit++;
2289   else if (n->range == 32)
2290     bswap_stats.found_32bit++;
2291   else
2292     {
2293       gcc_assert (n->range == 64);
2294       bswap_stats.found_64bit++;
2295     }
2296
2297   tmp = src;
2298
2299   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2300      are considered as rotation of 2N bit values by N bits is generally not
2301      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2302      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2303   if (bswap && n->range == 16)
2304     {
2305       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2306       bswap_type = TREE_TYPE (src);  /* The rotate is done in SRC's own type.  */
2307       src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2308       bswap_stmt = gimple_build_assign (NULL, src);
2309     }
2310   else
2311     {
2312       /* Convert the src expression if necessary.  */
2313       if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2314         {
2315           gimple convert_stmt;
2316           tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2317           convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2318           gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2319         }
2320
2321       bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2322     }
2323
2324   tmp = tgt;
2325
2326   /* Convert the result if necessary.  */
2327   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2328     {
2329       gimple convert_stmt;
2330       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2331       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2332       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2333     }
2334
2335   gimple_set_lhs (bswap_stmt, tmp);
2336
2337   if (dump_file)
2338     {
2339       fprintf (dump_file, "%d bit bswap implementation found at: ",
2340                (int)n->range);
2341       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2342     }
2343
2344   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2345   gsi_remove (&gsi, true);  /* CUR_STMT itself is no longer needed.  */
2346   return true;
2347 }
2348
2349 /* Find manual byte swap implementations as well as load in a given
2350    endianness. Byte swaps are turned into a bswap builtin invocation
2351    while endian loads are converted to bswap builtin invocation or
2352    simple load according to the target endianness.  */
2353
2354 unsigned int
2355 pass_optimize_bswap::execute (function *fun)
2356 {
2357   basic_block bb;
2358   bool bswap16_p, bswap32_p, bswap64_p;
2359   bool changed = false;
2360   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2361
2362   if (BITS_PER_UNIT != 8)
2363     return 0;  /* Nothing is done when bytes are not 8 bits wide.  */
2364
2365   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
2366                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
2367   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2368                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2369   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2370                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2371                    || (bswap32_p && word_mode == SImode)));
2372
2373   /* Determine the argument type of the builtins.  The code later on
2374      assumes that the return and argument type are the same.  */
2375   if (bswap16_p)
2376     {
2377       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2378       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2379     }
2380
2381   if (bswap32_p)
2382     {
2383       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2384       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2385     }
2386
2387   if (bswap64_p)
2388     {
2389       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2390       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2391     }
2392
2393   memset (&nop_stats, 0, sizeof (nop_stats));
2394   memset (&bswap_stats, 0, sizeof (bswap_stats));
2395
2396   FOR_EACH_BB_FN (bb, fun)
2397     {
2398       gimple_stmt_iterator gsi;
2399
2400       /* We do a reverse scan for bswap patterns to make sure we get the
2401          widest match. As bswap pattern matching doesn't handle previously
2402          inserted smaller bswap replacements as sub-patterns, the wider
2403          variant wouldn't be detected.  */
2404       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2405         {
2406           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2407           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2408           enum tree_code code;
2409           struct symbolic_number n;
2410           bool bswap;
2411
2412           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2413              might be moved to a different basic block by bswap_replace and gsi
2414              must not point to it if that's the case.  Moving the gsi_prev
2415              here makes sure that gsi points to the statement previous to
2416              cur_stmt while still making sure that all statements are
2417              considered in this basic block.  */
2418           gsi_prev (&gsi);
2419
2420           if (!is_gimple_assign (cur_stmt))
2421             continue;
2422
2423           code = gimple_assign_rhs_code (cur_stmt);
2424           switch (code)
2425             {
2426             case LROTATE_EXPR:
2427             case RROTATE_EXPR:
2428               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2429                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2430                      % BITS_PER_UNIT)
2431                 continue;
2432               /* Fall through.  */
2433             case BIT_IOR_EXPR:
2434               break;
2435             default:
2436               continue;
2437             }
2438
2439           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2440
2441           if (!src_stmt)
2442             continue;
2443
2444           switch (n.range)  /* Size in bits of the matched expression.  */
2445             {
2446             case 16:
2447               /* Already in canonical form, nothing to do.  */
2448               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2449                 continue;
2450               load_type = uint16_type_node;
2451               if (bswap16_p)
2452                 {
2453                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2454                   bswap_type = bswap16_type;
2455                 }
2456               break;
2457             case 32:
2458               load_type = uint32_type_node;
2459               if (bswap32_p)
2460                 {
2461                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2462                   bswap_type = bswap32_type;
2463                 }
2464               break;
2465             case 64:
2466               load_type = uint64_type_node;
2467               if (bswap64_p)
2468                 {
2469                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2470                   bswap_type = bswap64_type;
2471                 }
2472               break;
2473             default:
2474               continue;
2475             }
2476
2477           if (bswap && !fndecl)  /* No usable bswap builtin for this size.  */
2478             continue;
2479
2480           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2481                              &n, bswap))
2482             changed = true;
2483         }
2484     }
2485
2486   statistics_counter_event (fun, "16-bit nop implementations found",
2487                             nop_stats.found_16bit);
2488   statistics_counter_event (fun, "32-bit nop implementations found",
2489                             nop_stats.found_32bit);
2490   statistics_counter_event (fun, "64-bit nop implementations found",
2491                             nop_stats.found_64bit);
2492   statistics_counter_event (fun, "16-bit bswap implementations found",
2493                             bswap_stats.found_16bit);
2494   statistics_counter_event (fun, "32-bit bswap implementations found",
2495                             bswap_stats.found_32bit);
2496   statistics_counter_event (fun, "64-bit bswap implementations found",
2497                             bswap_stats.found_64bit);
2498
2499   return (changed ? TODO_update_ssa : 0);  /* SSA update only if needed.  */
2500 }
2501
2502 } // anon namespace
2503 /* Return a newly allocated instance of the bswap pass for context CTXT.  */
2504 gimple_opt_pass *
2505 make_pass_optimize_bswap (gcc::context *ctxt)
2506 {
2507   return new pass_optimize_bswap (ctxt);
2508 }
2509
2510 /* Return true if STMT is a conversion that may be looked through when its
2511    result is an operand of a widening multiply producing RESULT_TYPE.  */
2512 static bool
2513 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2514 {
2515   enum tree_code conv_code = gimple_assign_rhs_code (stmt);
2516   tree outer_type, inner_type;
2517   bool sign_ok, widens;
2518
2519   /* For non-integer results only FIXED_CONVERT_EXPR qualifies.  */
2520   if (TREE_CODE (result_type) != INTEGER_TYPE)
2521     return conv_code == FIXED_CONVERT_EXPR;
2522
2523   /* For integer results STMT must satisfy CONVERT_EXPR_CODE_P.  */
2524   if (!CONVERT_EXPR_CODE_P (conv_code))
2525     return false;
2526
2527   /* OUTER_TYPE is what STMT converts to, INNER_TYPE what it converts from.  */
2528   outer_type = TREE_TYPE (gimple_assign_lhs (stmt));
2529   inner_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2530
2531   /* A conversion to the precision of the result can always be stripped:
2532      the multiply operation will be selected to create the correct
2533      extension as a by-product.  */
2534   if (TYPE_PRECISION (outer_type) == TYPE_PRECISION (result_type))
2535     return true;
2536
2537   /* Otherwise the conversion has to preserve the signedness of the
2538      operation.  An unsigned inner-most type allows any intermediate
2539      widening operation; a signed one requires the intermediate type to
2540      be signed as well.  */
2541   sign_ok = (TYPE_UNSIGNED (inner_type)
2542              || TYPE_UNSIGNED (outer_type) == TYPE_UNSIGNED (inner_type));
2543
2544   /* It also must not narrow the range, i.e. it must strictly widen.  */
2545   widens = TYPE_PRECISION (outer_type) > TYPE_PRECISION (inner_type);
2546
2547   return sign_ok && widens;
2548 }
2549
2550
2551 /* Return true if RHS is a suitable operand for a widening multiplication,
2552    assuming a target type of TYPE.
2553    There are two cases:
2554
2555      - RHS makes some value at least twice as wide.  Store that value
2556        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2557
2558      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2559        but leave *TYPE_OUT untouched.  */
2560
2561 static bool
2562 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2563                         tree *new_rhs_out)
2564 {
2565   gimple stmt;
2566   tree type1, rhs1;
2567
2568   if (TREE_CODE (rhs) == SSA_NAME)
2569     {
2570       stmt = SSA_NAME_DEF_STMT (rhs);
2571       if (is_gimple_assign (stmt))
2572         {
2573           if (! widening_mult_conversion_strippable_p (type, stmt))
2574             rhs1 = rhs;
2575           else
2576             {
2577               rhs1 = gimple_assign_rhs1 (stmt);
2578
2579               if (TREE_CODE (rhs1) == INTEGER_CST)
2580                 {
2581                   *new_rhs_out = rhs1;
2582                   *type_out = NULL;
2583                   return true;
2584                 }
2585             }
2586         }
2587       else
2588         rhs1 = rhs;
2589
2590       type1 = TREE_TYPE (rhs1);
2591
2592       if (TREE_CODE (type1) != TREE_CODE (type)
2593           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2594         return false;
2595
2596       *new_rhs_out = rhs1;
2597       *type_out = type1;
2598       return true;
2599     }
2600
2601   if (TREE_CODE (rhs) == INTEGER_CST)
2602     {
2603       *new_rhs_out = rhs;
2604       *type_out = NULL;
2605       return true;
2606     }
2607
2608   return false;
2609 }
2610
2611 /* Return true if STMT performs a widening multiplication, assuming the
2612    output type is TYPE.  If so, store the unwidened types of the operands
2613    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2614    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2615    and *TYPE2_OUT would give the operands of the multiplication.  */
2616
2617 static bool
2618 is_widening_mult_p (gimple stmt,
2619                     tree *type1_out, tree *rhs1_out,
2620                     tree *type2_out, tree *rhs2_out)
2621 {
2622   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2623
2624   if (TREE_CODE (type) != INTEGER_TYPE
2625       && TREE_CODE (type) != FIXED_POINT_TYPE)
2626     return false;
2627
2628   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2629                                rhs1_out))
2630     return false;
2631
2632   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2633                                rhs2_out))
2634     return false;
2635
2636   if (*type1_out == NULL)
2637     {
2638       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2639         return false;
2640       *type1_out = *type2_out;
2641     }
2642
2643   if (*type2_out == NULL)
2644     {
2645       if (!int_fits_type_p (*rhs2_out, *type1_out))
2646         return false;
2647       *type2_out = *type1_out;
2648     }
2649
2650   /* Ensure that the larger of the two operands comes first. */
2651   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2652     {
2653       tree tmp;
2654       tmp = *type1_out;
2655       *type1_out = *type2_out;
2656       *type2_out = tmp;
2657       tmp = *rhs1_out;
2658       *rhs1_out = *rhs2_out;
2659       *rhs2_out = tmp;
2660     }
2661
2662   return true;
2663 }
2664
2665 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2666    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2667    value is true iff we converted the statement.  */
2668
2669 static bool
2670 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2671 {
2672   tree lhs, rhs1, rhs2, type, type1, type2;
2673   enum insn_code handler;
2674   machine_mode to_mode, from_mode, actual_mode;
2675   optab op;
2676   int actual_precision;
2677   location_t loc = gimple_location (stmt);
2678   bool from_unsigned1, from_unsigned2;
2679
2680   lhs = gimple_assign_lhs (stmt);
2681   type = TREE_TYPE (lhs);
2682   if (TREE_CODE (type) != INTEGER_TYPE)
2683     return false;
2684
2685   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2686     return false;
2687
2688   to_mode = TYPE_MODE (type);
2689   from_mode = TYPE_MODE (type1);
2690   from_unsigned1 = TYPE_UNSIGNED (type1);
2691   from_unsigned2 = TYPE_UNSIGNED (type2);
2692
2693   if (from_unsigned1 && from_unsigned2)
2694     op = umul_widen_optab;
2695   else if (!from_unsigned1 && !from_unsigned2)
2696     op = smul_widen_optab;
2697   else
2698     op = usmul_widen_optab;
2699
2700   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2701                                                   0, &actual_mode);
2702
2703   if (handler == CODE_FOR_nothing)
2704     {
2705       if (op != smul_widen_optab)
2706         {
2707           /* We can use a signed multiply with unsigned types as long as
2708              there is a wider mode to use, or it is the smaller of the two
2709              types that is unsigned.  Note that type1 >= type2, always.  */
2710           if ((TYPE_UNSIGNED (type1)
2711                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2712               || (TYPE_UNSIGNED (type2)
2713                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2714             {
2715               from_mode = GET_MODE_WIDER_MODE (from_mode);
2716               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2717                 return false;
2718             }
2719
2720           op = smul_widen_optab;
2721           handler = find_widening_optab_handler_and_mode (op, to_mode,
2722                                                           from_mode, 0,
2723                                                           &actual_mode);
2724
2725           if (handler == CODE_FOR_nothing)
2726             return false;
2727
2728           from_unsigned1 = from_unsigned2 = false;
2729         }
2730       else
2731         return false;
2732     }
2733
2734   /* Ensure that the inputs to the handler are in the correct precison
2735      for the opcode.  This will be the full mode size.  */
2736   actual_precision = GET_MODE_PRECISION (actual_mode);
2737   if (2 * actual_precision > TYPE_PRECISION (type))
2738     return false;
2739   if (actual_precision != TYPE_PRECISION (type1)
2740       || from_unsigned1 != TYPE_UNSIGNED (type1))
2741     rhs1 = build_and_insert_cast (gsi, loc,
2742                                   build_nonstandard_integer_type
2743                                     (actual_precision, from_unsigned1), rhs1);
2744   if (actual_precision != TYPE_PRECISION (type2)
2745       || from_unsigned2 != TYPE_UNSIGNED (type2))
2746     rhs2 = build_and_insert_cast (gsi, loc,
2747                                   build_nonstandard_integer_type
2748                                     (actual_precision, from_unsigned2), rhs2);
2749
2750   /* Handle constants.  */
2751   if (TREE_CODE (rhs1) == INTEGER_CST)
2752     rhs1 = fold_convert (type1, rhs1);
2753   if (TREE_CODE (rhs2) == INTEGER_CST)
2754     rhs2 = fold_convert (type2, rhs2);
2755
2756   gimple_assign_set_rhs1 (stmt, rhs1);
2757   gimple_assign_set_rhs2 (stmt, rhs2);
2758   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2759   update_stmt (stmt);
2760   widen_mul_stats.widen_mults_inserted++;
2761   return true;
2762 }
2763
2764 /* Process a single gimple statement STMT, which is found at the
2765    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2766    rhs (given by CODE), and try to convert it into a
2767    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2768    is true iff we converted the statement.  */
2769
2770 static bool
2771 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2772                             enum tree_code code)
2773 {
2774   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2775   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2776   tree type, type1, type2, optype;
2777   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2778   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2779   optab this_optab;
2780   enum tree_code wmult_code;
2781   enum insn_code handler;
2782   machine_mode to_mode, from_mode, actual_mode;
2783   location_t loc = gimple_location (stmt);
2784   int actual_precision;
2785   bool from_unsigned1, from_unsigned2;
2786
2787   lhs = gimple_assign_lhs (stmt);
2788   type = TREE_TYPE (lhs);
2789   if (TREE_CODE (type) != INTEGER_TYPE
2790       && TREE_CODE (type) != FIXED_POINT_TYPE)
2791     return false;
2792
2793   if (code == MINUS_EXPR)
2794     wmult_code = WIDEN_MULT_MINUS_EXPR;
2795   else
2796     wmult_code = WIDEN_MULT_PLUS_EXPR;
2797
2798   rhs1 = gimple_assign_rhs1 (stmt);
2799   rhs2 = gimple_assign_rhs2 (stmt);
2800
2801   if (TREE_CODE (rhs1) == SSA_NAME)
2802     {
2803       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2804       if (is_gimple_assign (rhs1_stmt))
2805         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2806     }
2807
2808   if (TREE_CODE (rhs2) == SSA_NAME)
2809     {
2810       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2811       if (is_gimple_assign (rhs2_stmt))
2812         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2813     }
2814
2815   /* Allow for one conversion statement between the multiply
2816      and addition/subtraction statement.  If there are more than
2817      one conversions then we assume they would invalidate this
2818      transformation.  If that's not the case then they should have
2819      been folded before now.  */
2820   if (CONVERT_EXPR_CODE_P (rhs1_code))
2821     {
2822       conv1_stmt = rhs1_stmt;
2823       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2824       if (TREE_CODE (rhs1) == SSA_NAME)
2825         {
2826           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2827           if (is_gimple_assign (rhs1_stmt))
2828             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2829         }
2830       else
2831         return false;
2832     }
2833   if (CONVERT_EXPR_CODE_P (rhs2_code))
2834     {
2835       conv2_stmt = rhs2_stmt;
2836       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2837       if (TREE_CODE (rhs2) == SSA_NAME)
2838         {
2839           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2840           if (is_gimple_assign (rhs2_stmt))
2841             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2842         }
2843       else
2844         return false;
2845     }
2846
2847   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2848      is_widening_mult_p, but we still need the rhs returns.
2849
2850      It might also appear that it would be sufficient to use the existing
2851      operands of the widening multiply, but that would limit the choice of
2852      multiply-and-accumulate instructions.
2853
2854      If the widened-multiplication result has more than one uses, it is
2855      probably wiser not to do the conversion.  */
2856   if (code == PLUS_EXPR
2857       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2858     {
2859       if (!has_single_use (rhs1)
2860           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2861                                   &type2, &mult_rhs2))
2862         return false;
2863       add_rhs = rhs2;
2864       conv_stmt = conv1_stmt;
2865     }
2866   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2867     {
2868       if (!has_single_use (rhs2)
2869           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2870                                   &type2, &mult_rhs2))
2871         return false;
2872       add_rhs = rhs1;
2873       conv_stmt = conv2_stmt;
2874     }
2875   else
2876     return false;
2877
2878   to_mode = TYPE_MODE (type);
2879   from_mode = TYPE_MODE (type1);
2880   from_unsigned1 = TYPE_UNSIGNED (type1);
2881   from_unsigned2 = TYPE_UNSIGNED (type2);
2882   optype = type1;
2883
2884   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2885   if (from_unsigned1 != from_unsigned2)
2886     {
2887       if (!INTEGRAL_TYPE_P (type))
2888         return false;
2889       /* We can use a signed multiply with unsigned types as long as
2890          there is a wider mode to use, or it is the smaller of the two
2891          types that is unsigned.  Note that type1 >= type2, always.  */
2892       if ((from_unsigned1
2893            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2894           || (from_unsigned2
2895               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2896         {
2897           from_mode = GET_MODE_WIDER_MODE (from_mode);
2898           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2899             return false;
2900         }
2901
2902       from_unsigned1 = from_unsigned2 = false;
2903       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2904                                                false);
2905     }
2906
2907   /* If there was a conversion between the multiply and addition
2908      then we need to make sure it fits a multiply-and-accumulate.
2909      The should be a single mode change which does not change the
2910      value.  */
2911   if (conv_stmt)
2912     {
2913       /* We use the original, unmodified data types for this.  */
2914       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2915       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2916       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2917       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2918
2919       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2920         {
2921           /* Conversion is a truncate.  */
2922           if (TYPE_PRECISION (to_type) < data_size)
2923             return false;
2924         }
2925       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2926         {
2927           /* Conversion is an extend.  Check it's the right sort.  */
2928           if (TYPE_UNSIGNED (from_type) != is_unsigned
2929               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2930             return false;
2931         }
2932       /* else convert is a no-op for our purposes.  */
2933     }
2934
2935   /* Verify that the machine can perform a widening multiply
2936      accumulate in this mode/signedness combination, otherwise
2937      this transformation is likely to pessimize code.  */
2938   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2939   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2940                                                   from_mode, 0, &actual_mode);
2941
2942   if (handler == CODE_FOR_nothing)
2943     return false;
2944
2945   /* Ensure that the inputs to the handler are in the correct precison
2946      for the opcode.  This will be the full mode size.  */
2947   actual_precision = GET_MODE_PRECISION (actual_mode);
2948   if (actual_precision != TYPE_PRECISION (type1)
2949       || from_unsigned1 != TYPE_UNSIGNED (type1))
2950     mult_rhs1 = build_and_insert_cast (gsi, loc,
2951                                        build_nonstandard_integer_type
2952                                          (actual_precision, from_unsigned1),
2953                                        mult_rhs1);
2954   if (actual_precision != TYPE_PRECISION (type2)
2955       || from_unsigned2 != TYPE_UNSIGNED (type2))
2956     mult_rhs2 = build_and_insert_cast (gsi, loc,
2957                                        build_nonstandard_integer_type
2958                                          (actual_precision, from_unsigned2),
2959                                        mult_rhs2);
2960
2961   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2962     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2963
2964   /* Handle constants.  */
2965   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2966     mult_rhs1 = fold_convert (type1, mult_rhs1);
2967   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2968     mult_rhs2 = fold_convert (type2, mult_rhs2);
2969
2970   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2971                                   add_rhs);
2972   update_stmt (gsi_stmt (*gsi));
2973   widen_mul_stats.maccs_inserted++;
2974   return true;
2975 }
2976
2977 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2978    with uses in additions and subtractions to form fused multiply-add
2979    operations.  Returns true if successful and MUL_STMT should be removed.  */
2980
2981 static bool
2982 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2983 {
2984   tree mul_result = gimple_get_lhs (mul_stmt);
2985   tree type = TREE_TYPE (mul_result);
2986   gimple use_stmt, neguse_stmt;
2987   gassign *fma_stmt;
2988   use_operand_p use_p;
2989   imm_use_iterator imm_iter;
2990
2991   if (FLOAT_TYPE_P (type)
2992       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2993     return false;
2994
2995   /* We don't want to do bitfield reduction ops.  */
2996   if (INTEGRAL_TYPE_P (type)
2997       && (TYPE_PRECISION (type)
2998           != GET_MODE_PRECISION (TYPE_MODE (type))))
2999     return false;
3000
3001   /* If the target doesn't support it, don't generate it.  We assume that
3002      if fma isn't available then fms, fnma or fnms are not either.  */
3003   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3004     return false;
3005
3006   /* If the multiplication has zero uses, it is kept around probably because
3007      of -fnon-call-exceptions.  Don't optimize it away in that case,
3008      it is DCE job.  */
3009   if (has_zero_uses (mul_result))
3010     return false;
3011
3012   /* Make sure that the multiplication statement becomes dead after
3013      the transformation, thus that all uses are transformed to FMAs.
3014      This means we assume that an FMA operation has the same cost
3015      as an addition.  */
3016   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3017     {
3018       enum tree_code use_code;
3019       tree result = mul_result;
3020       bool negate_p = false;
3021
3022       use_stmt = USE_STMT (use_p);
3023
3024       if (is_gimple_debug (use_stmt))
3025         continue;
3026
3027       /* For now restrict this operations to single basic blocks.  In theory
3028          we would want to support sinking the multiplication in
3029          m = a*b;
3030          if ()
3031            ma = m + c;
3032          else
3033            d = m;
3034          to form a fma in the then block and sink the multiplication to the
3035          else block.  */
3036       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3037         return false;
3038
3039       if (!is_gimple_assign (use_stmt))
3040         return false;
3041
3042       use_code = gimple_assign_rhs_code (use_stmt);
3043
3044       /* A negate on the multiplication leads to FNMA.  */
3045       if (use_code == NEGATE_EXPR)
3046         {
3047           ssa_op_iter iter;
3048           use_operand_p usep;
3049
3050           result = gimple_assign_lhs (use_stmt);
3051
3052           /* Make sure the negate statement becomes dead with this
3053              single transformation.  */
3054           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3055                                &use_p, &neguse_stmt))
3056             return false;
3057
3058           /* Make sure the multiplication isn't also used on that stmt.  */
3059           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3060             if (USE_FROM_PTR (usep) == mul_result)
3061               return false;
3062
3063           /* Re-validate.  */
3064           use_stmt = neguse_stmt;
3065           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3066             return false;
3067           if (!is_gimple_assign (use_stmt))
3068             return false;
3069
3070           use_code = gimple_assign_rhs_code (use_stmt);
3071           negate_p = true;
3072         }
3073
3074       switch (use_code)
3075         {
3076         case MINUS_EXPR:
3077           if (gimple_assign_rhs2 (use_stmt) == result)
3078             negate_p = !negate_p;
3079           break;
3080         case PLUS_EXPR:
3081           break;
3082         default:
3083           /* FMA can only be formed from PLUS and MINUS.  */
3084           return false;
3085         }
3086
3087       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3088          by a MULT_EXPR that we'll visit later, we might be able to
3089          get a more profitable match with fnma.
3090          OTOH, if we don't, a negate / fma pair has likely lower latency
3091          that a mult / subtract pair.  */
3092       if (use_code == MINUS_EXPR && !negate_p
3093           && gimple_assign_rhs1 (use_stmt) == result
3094           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3095           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3096         {
3097           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3098
3099           if (TREE_CODE (rhs2) == SSA_NAME)
3100             {
3101               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3102               if (has_single_use (rhs2)
3103                   && is_gimple_assign (stmt2)
3104                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3105               return false;
3106             }
3107         }
3108
3109       /* We can't handle a * b + a * b.  */
3110       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3111         return false;
3112
3113       /* While it is possible to validate whether or not the exact form
3114          that we've recognized is available in the backend, the assumption
3115          is that the transformation is never a loss.  For instance, suppose
3116          the target only has the plain FMA pattern available.  Consider
3117          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3118          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3119          still have 3 operations, but in the FMA form the two NEGs are
3120          independent and could be run in parallel.  */
3121     }
3122
3123   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3124     {
3125       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3126       enum tree_code use_code;
3127       tree addop, mulop1 = op1, result = mul_result;
3128       bool negate_p = false;
3129
3130       if (is_gimple_debug (use_stmt))
3131         continue;
3132
3133       use_code = gimple_assign_rhs_code (use_stmt);
3134       if (use_code == NEGATE_EXPR)
3135         {
3136           result = gimple_assign_lhs (use_stmt);
3137           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3138           gsi_remove (&gsi, true);
3139           release_defs (use_stmt);
3140
3141           use_stmt = neguse_stmt;
3142           gsi = gsi_for_stmt (use_stmt);
3143           use_code = gimple_assign_rhs_code (use_stmt);
3144           negate_p = true;
3145         }
3146
3147       if (gimple_assign_rhs1 (use_stmt) == result)
3148         {
3149           addop = gimple_assign_rhs2 (use_stmt);
3150           /* a * b - c -> a * b + (-c)  */
3151           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3152             addop = force_gimple_operand_gsi (&gsi,
3153                                               build1 (NEGATE_EXPR,
3154                                                       type, addop),
3155                                               true, NULL_TREE, true,
3156                                               GSI_SAME_STMT);
3157         }
3158       else
3159         {
3160           addop = gimple_assign_rhs1 (use_stmt);
3161           /* a - b * c -> (-b) * c + a */
3162           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3163             negate_p = !negate_p;
3164         }
3165
3166       if (negate_p)
3167         mulop1 = force_gimple_operand_gsi (&gsi,
3168                                            build1 (NEGATE_EXPR,
3169                                                    type, mulop1),
3170                                            true, NULL_TREE, true,
3171                                            GSI_SAME_STMT);
3172
3173       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3174                                       FMA_EXPR, mulop1, op2, addop);
3175       gsi_replace (&gsi, fma_stmt, true);
3176       widen_mul_stats.fmas_inserted++;
3177     }
3178
3179   return true;
3180 }
3181
3182 /* Find integer multiplications where the operands are extended from
3183    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3184    where appropriate.  */
3185
3186 namespace {
3187
3188 const pass_data pass_data_optimize_widening_mul =
3189 {
3190   GIMPLE_PASS, /* type */
3191   "widening_mul", /* name */
3192   OPTGROUP_NONE, /* optinfo_flags */
3193   TV_NONE, /* tv_id */
3194   PROP_ssa, /* properties_required */
3195   0, /* properties_provided */
3196   0, /* properties_destroyed */
3197   0, /* todo_flags_start */
3198   TODO_update_ssa, /* todo_flags_finish */
3199 };
3200
3201 class pass_optimize_widening_mul : public gimple_opt_pass
3202 {
3203 public:
3204   pass_optimize_widening_mul (gcc::context *ctxt)
3205     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3206   {}
3207
3208   /* opt_pass methods: */
3209   virtual bool gate (function *)
3210     {
3211       return flag_expensive_optimizations && optimize;
3212     }
3213
3214   virtual unsigned int execute (function *);
3215
3216 }; // class pass_optimize_widening_mul
3217
3218 unsigned int
3219 pass_optimize_widening_mul::execute (function *fun)
3220 {
3221   basic_block bb;
3222   bool cfg_changed = false;
3223
3224   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3225
3226   FOR_EACH_BB_FN (bb, fun)
3227     {
3228       gimple_stmt_iterator gsi;
3229
3230       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3231         {
3232           gimple stmt = gsi_stmt (gsi);
3233           enum tree_code code;
3234
3235           if (is_gimple_assign (stmt))
3236             {
3237               code = gimple_assign_rhs_code (stmt);
3238               switch (code)
3239                 {
3240                 case MULT_EXPR:
3241                   if (!convert_mult_to_widen (stmt, &gsi)
3242                       && convert_mult_to_fma (stmt,
3243                                               gimple_assign_rhs1 (stmt),
3244                                               gimple_assign_rhs2 (stmt)))
3245                     {
3246                       gsi_remove (&gsi, true);
3247                       release_defs (stmt);
3248                       continue;
3249                     }
3250                   break;
3251
3252                 case PLUS_EXPR:
3253                 case MINUS_EXPR:
3254                   convert_plusminus_to_widen (&gsi, stmt, code);
3255                   break;
3256
3257                 default:;
3258                 }
3259             }
3260           else if (is_gimple_call (stmt)
3261                    && gimple_call_lhs (stmt))
3262             {
3263               tree fndecl = gimple_call_fndecl (stmt);
3264               if (fndecl
3265                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3266                 {
3267                   switch (DECL_FUNCTION_CODE (fndecl))
3268                     {
3269                       case BUILT_IN_POWF:
3270                       case BUILT_IN_POW:
3271                       case BUILT_IN_POWL:
3272                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3273                             && REAL_VALUES_EQUAL
3274                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3275                                   dconst2)
3276                             && convert_mult_to_fma (stmt,
3277                                                     gimple_call_arg (stmt, 0),
3278                                                     gimple_call_arg (stmt, 0)))
3279                           {
3280                             unlink_stmt_vdef (stmt);
3281                             if (gsi_remove (&gsi, true)
3282                                 && gimple_purge_dead_eh_edges (bb))
3283                               cfg_changed = true;
3284                             release_defs (stmt);
3285                             continue;
3286                           }
3287                           break;
3288
3289                       default:;
3290                     }
3291                 }
3292             }
3293           gsi_next (&gsi);
3294         }
3295     }
3296
3297   statistics_counter_event (fun, "widening multiplications inserted",
3298                             widen_mul_stats.widen_mults_inserted);
3299   statistics_counter_event (fun, "widening maccs inserted",
3300                             widen_mul_stats.maccs_inserted);
3301   statistics_counter_event (fun, "fused multiply-adds inserted",
3302                             widen_mul_stats.fmas_inserted);
3303
3304   return cfg_changed ? TODO_cleanup_cfg : 0;
3305 }
3306
3307 } // anon namespace
3308
3309 gimple_opt_pass *
3310 make_pass_optimize_widening_mul (gcc::context *ctxt)
3311 {
3312   return new pass_optimize_widening_mul (ctxt);
3313 }