gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2014 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43 First of all, with some experiments it was found that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, namely in blocks that are common dominators
55 of those that have the division.)
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
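/* As a small illustration of the post-domination criterion above
   (hypothetical blocks, threshold N = 2): if x / d occurs once in each
   arm of an if-then-else and nowhere else, neither arm post-dominates
   the block before the branch, so that block is annotated with zero
   post-dominating divisions and each arm with only one; no block reaches
   the threshold and no reciprocal is inserted. */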
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "predict.h"
94 #include "vec.h"
95 #include "hashtab.h"
96 #include "hash-set.h"
97 #include "machmode.h"
98 #include "hard-reg-set.h"
99 #include "input.h"
100 #include "function.h"
101 #include "dominance.h"
102 #include "cfg.h"
103 #include "basic-block.h"
104 #include "tree-ssa-alias.h"
105 #include "internal-fn.h"
106 #include "gimple-fold.h"
107 #include "gimple-expr.h"
108 #include "is-a.h"
109 #include "gimple.h"
110 #include "gimple-iterator.h"
111 #include "gimplify.h"
112 #include "gimplify-me.h"
113 #include "stor-layout.h"
114 #include "gimple-ssa.h"
115 #include "tree-cfg.h"
116 #include "tree-phinodes.h"
117 #include "ssa-iterators.h"
118 #include "stringpool.h"
119 #include "tree-ssanames.h"
120 #include "expr.h"
121 #include "tree-dfa.h"
122 #include "tree-ssa.h"
123 #include "tree-pass.h"
124 #include "alloc-pool.h"
125 #include "target.h"
126 #include "gimple-pretty-print.h"
127 #include "builtins.h"
128
129 /* FIXME: RTL headers have to be included here for optabs. */
130 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
131 #include "expr.h" /* Because optabs.h wants sepops. */
132 #include "insn-codes.h"
133 #include "optabs.h"
134
135 /* This structure represents one basic block that either computes a
136 division, or is a common dominator for basic blocks that compute a
137 division. */
138 struct occurrence {
139 /* The basic block represented by this structure. */
140 basic_block bb;
141
142 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
143 inserted in BB. */
144 tree recip_def;
145
146 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
147 was inserted in BB. */
148 gimple recip_def_stmt;
149
150 /* Pointer to a list of "struct occurrence"s for blocks dominated
151 by BB. */
152 struct occurrence *children;
153
154 /* Pointer to the next "struct occurrence" in the list of blocks
155 sharing a common dominator. */
156 struct occurrence *next;
157
158 /* The number of divisions that are in BB before compute_merit. The
159 number of divisions that are in BB or post-dominate it after
160 compute_merit. */
161 int num_divisions;
162
163 /* True if the basic block has a division, false if it is a common
164 dominator for basic blocks that do. If it is false and trapping
165 math is active, BB is not a candidate for inserting a reciprocal. */
166 bool bb_has_division;
167 };
168
169 static struct
170 {
171 /* Number of 1.0/X ops inserted. */
172 int rdivs_inserted;
173
174 /* Number of 1.0/FUNC ops inserted. */
175 int rfuncs_inserted;
176 } reciprocal_stats;
177
178 static struct
179 {
180 /* Number of cexpi calls inserted. */
181 int inserted;
182 } sincos_stats;
183
184 static struct
185 {
186 /* Number of hand-written 16-bit nop / bswaps found. */
187 int found_16bit;
188
189 /* Number of hand-written 32-bit nop / bswaps found. */
190 int found_32bit;
191
192 /* Number of hand-written 64-bit nop / bswaps found. */
193 int found_64bit;
194 } nop_stats, bswap_stats;
195
196 static struct
197 {
198 /* Number of widening multiplication ops inserted. */
199 int widen_mults_inserted;
200
201 /* Number of integer multiply-and-accumulate ops inserted. */
202 int maccs_inserted;
203
204 /* Number of fp fused multiply-add ops inserted. */
205 int fmas_inserted;
206 } widen_mul_stats;
207
208 /* The instance of "struct occurrence" representing the highest
209 interesting block in the dominator tree. */
210 static struct occurrence *occ_head;
211
212 /* Allocation pool for getting instances of "struct occurrence". */
213 static alloc_pool occ_pool;
214
215
216
217 /* Allocate and return a new struct occurrence for basic block BB,
218 whose children list is headed by CHILDREN. */
219 static struct occurrence *
220 occ_new (basic_block bb, struct occurrence *children)
221 {
222 struct occurrence *occ;
223
224 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
225 memset (occ, 0, sizeof (struct occurrence));
226
227 occ->bb = bb;
228 occ->children = children;
229 return occ;
230 }
231
232
233 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
234 list of "struct occurrence"s, one per basic block, having IDOM as
235 their common dominator.
236
237 We try to insert NEW_OCC as deep as possible in the tree, and we also
238 insert any other block that is a common dominator for BB and one
239 block already in the tree. */
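/* As an illustration of the "intermediate dominator" case handled below
   (B2, B4 and B6 are hypothetical basic blocks): suppose IDOM's list
   currently holds an occurrence for B4 and we insert NEW_OCC for B6,
   where neither block dominates the other but both are dominated by
   B2 != IDOM.  A fresh occurrence for B2 is then created with B6 and B4
   as its children, and B2 takes B4's place in IDOM's list.  */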
240
241 static void
242 insert_bb (struct occurrence *new_occ, basic_block idom,
243 struct occurrence **p_head)
244 {
245 struct occurrence *occ, **p_occ;
246
247 for (p_occ = p_head; (occ = *p_occ) != NULL; )
248 {
249 basic_block bb = new_occ->bb, occ_bb = occ->bb;
250 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
251 if (dom == bb)
252 {
253 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
254 from its list. */
255 *p_occ = occ->next;
256 occ->next = new_occ->children;
257 new_occ->children = occ;
258
259 /* Try the next block (it may also be dominated by BB). */
260 }
261
262 else if (dom == occ_bb)
263 {
264 /* OCC_BB dominates BB. Tail recurse to look deeper. */
265 insert_bb (new_occ, dom, &occ->children);
266 return;
267 }
268
269 else if (dom != idom)
270 {
271 gcc_assert (!dom->aux);
272
273 /* There is a dominator between IDOM and BB, add it and make
274 two children out of NEW_OCC and OCC. First, remove OCC from
275 its list. */
276 *p_occ = occ->next;
277 new_occ->next = occ;
278 occ->next = NULL;
279
280 /* None of the previous blocks has DOM as a dominator: if we tail
281 recursed, we would reexamine them uselessly. Just switch BB with
282 DOM, and go on looking for blocks dominated by DOM. */
283 new_occ = occ_new (dom, new_occ);
284 }
285
286 else
287 {
288 /* Nothing special, go on with the next element. */
289 p_occ = &occ->next;
290 }
291 }
292
293 /* No place was found as a child of IDOM. Make BB a child of IDOM. */
294 new_occ->next = *p_head;
295 *p_head = new_occ;
296 }
297
298 /* Register that we found a division in BB. */
299
300 static inline void
301 register_division_in (basic_block bb)
302 {
303 struct occurrence *occ;
304
305 occ = (struct occurrence *) bb->aux;
306 if (!occ)
307 {
308 occ = occ_new (bb, NULL);
309 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
310 }
311
312 occ->bb_has_division = true;
313 occ->num_divisions++;
314 }
315
316
317 /* Compute the number of divisions that postdominate each block in OCC and
318 its children. */
319
320 static void
321 compute_merit (struct occurrence *occ)
322 {
323 struct occurrence *occ_child;
324 basic_block dom = occ->bb;
325
326 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
327 {
328 basic_block bb;
329 if (occ_child->children)
330 compute_merit (occ_child);
331
332 if (flag_exceptions)
333 bb = single_noncomplex_succ (dom);
334 else
335 bb = dom;
336
337 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
338 occ->num_divisions += occ_child->num_divisions;
339 }
340 }
341
342
343 /* Return whether USE_STMT is a floating-point division by DEF. */
344 static inline bool
345 is_division_by (gimple use_stmt, tree def)
346 {
347 return is_gimple_assign (use_stmt)
348 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
349 && gimple_assign_rhs2 (use_stmt) == def
350 /* Do not recognize x / x as a valid division, as we are getting
351 confused later by replacing all immediate uses of x in such
352 a stmt. */
353 && gimple_assign_rhs1 (use_stmt) != def;
354 }
355
356 /* Walk the subset of the dominator tree rooted at OCC, setting the
357 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
358 the given basic block. The field may be left NULL, of course,
359 if it is not possible or profitable to do the optimization.
360
361 DEF_GSI is an iterator pointing at the statement defining DEF.
362 If RECIP_DEF is set, a dominator already has a computation that can
363 be used. */
364
365 static void
366 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
367 tree def, tree recip_def, int threshold)
368 {
369 tree type;
370 gassign *new_stmt;
371 gimple_stmt_iterator gsi;
372 struct occurrence *occ_child;
373
374 if (!recip_def
375 && (occ->bb_has_division || !flag_trapping_math)
376 && occ->num_divisions >= threshold)
377 {
378 /* Make a variable with the replacement and substitute it. */
379 type = TREE_TYPE (def);
380 recip_def = create_tmp_reg (type, "reciptmp");
381 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
382 build_one_cst (type), def);
383
384 if (occ->bb_has_division)
385 {
386 /* Case 1: insert before an existing division. */
387 gsi = gsi_after_labels (occ->bb);
388 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
389 gsi_next (&gsi);
390
391 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
392 }
393 else if (def_gsi && occ->bb == def_gsi->bb)
394 {
395 /* Case 2: insert right after the definition. Note that this will
396 never happen if the definition statement can throw, because in
397 that case the sole successor of the statement's basic block will
398 dominate all the uses as well. */
399 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
400 }
401 else
402 {
403 /* Case 3: insert in a basic block not containing defs/uses. */
404 gsi = gsi_after_labels (occ->bb);
405 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
406 }
407
408 reciprocal_stats.rdivs_inserted++;
409
410 occ->recip_def_stmt = new_stmt;
411 }
412
413 occ->recip_def = recip_def;
414 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
415 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
416 }
417
418
419 /* Replace the division at USE_P with a multiplication by the reciprocal, if
420 possible. */
421
422 static inline void
423 replace_reciprocal (use_operand_p use_p)
424 {
425 gimple use_stmt = USE_STMT (use_p);
426 basic_block bb = gimple_bb (use_stmt);
427 struct occurrence *occ = (struct occurrence *) bb->aux;
428
429 if (optimize_bb_for_speed_p (bb)
430 && occ->recip_def && use_stmt != occ->recip_def_stmt)
431 {
432 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
433 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
434 SET_USE (use_p, occ->recip_def);
435 fold_stmt_inplace (&gsi);
436 update_stmt (use_stmt);
437 }
438 }
439
440
441 /* Free OCC and return one more "struct occurrence" to be freed. */
442
443 static struct occurrence *
444 free_bb (struct occurrence *occ)
445 {
446 struct occurrence *child, *next;
447
448 /* First get the two pointers hanging off OCC. */
449 next = occ->next;
450 child = occ->children;
451 occ->bb->aux = NULL;
452 pool_free (occ_pool, occ);
453
454 /* Now ensure that we don't recurse unless it is necessary. */
455 if (!child)
456 return next;
457 else
458 {
459 while (next)
460 next = free_bb (next);
461
462 return child;
463 }
464 }
465
466
467 /* Look for floating-point divisions among DEF's uses, and try to
468 replace them by multiplications with the reciprocal. Add
469 as many statements computing the reciprocal as needed.
470
471 DEF must be a GIMPLE register of a floating-point type. */
472
473 static void
474 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
475 {
476 use_operand_p use_p;
477 imm_use_iterator use_iter;
478 struct occurrence *occ;
479 int count = 0, threshold;
480
481 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
482
483 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
484 {
485 gimple use_stmt = USE_STMT (use_p);
486 if (is_division_by (use_stmt, def))
487 {
488 register_division_in (gimple_bb (use_stmt));
489 count++;
490 }
491 }
492
493 /* Do the expensive part only if we can hope to optimize something. */
494 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
495 if (count >= threshold)
496 {
497 gimple use_stmt;
498 for (occ = occ_head; occ; occ = occ->next)
499 {
500 compute_merit (occ);
501 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
502 }
503
504 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
505 {
506 if (is_division_by (use_stmt, def))
507 {
508 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
509 replace_reciprocal (use_p);
510 }
511 }
512 }
513
514 for (occ = occ_head; occ; )
515 occ = free_bb (occ);
516
517 occ_head = NULL;
518 }
519
520 /* Go through all the floating-point SSA_NAMEs, and call
521 execute_cse_reciprocals_1 on each of them. */
522 namespace {
523
524 const pass_data pass_data_cse_reciprocals =
525 {
526 GIMPLE_PASS, /* type */
527 "recip", /* name */
528 OPTGROUP_NONE, /* optinfo_flags */
529 TV_NONE, /* tv_id */
530 PROP_ssa, /* properties_required */
531 0, /* properties_provided */
532 0, /* properties_destroyed */
533 0, /* todo_flags_start */
534 TODO_update_ssa, /* todo_flags_finish */
535 };
536
537 class pass_cse_reciprocals : public gimple_opt_pass
538 {
539 public:
540 pass_cse_reciprocals (gcc::context *ctxt)
541 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
542 {}
543
544 /* opt_pass methods: */
545 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
546 virtual unsigned int execute (function *);
547
548 }; // class pass_cse_reciprocals
549
550 unsigned int
551 pass_cse_reciprocals::execute (function *fun)
552 {
553 basic_block bb;
554 tree arg;
555
556 occ_pool = create_alloc_pool ("dominators for recip",
557 sizeof (struct occurrence),
558 n_basic_blocks_for_fn (fun) / 3 + 1);
559
560 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
561 calculate_dominance_info (CDI_DOMINATORS);
562 calculate_dominance_info (CDI_POST_DOMINATORS);
563
564 #ifdef ENABLE_CHECKING
565 FOR_EACH_BB_FN (bb, fun)
566 gcc_assert (!bb->aux);
567 #endif
568
569 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
570 if (FLOAT_TYPE_P (TREE_TYPE (arg))
571 && is_gimple_reg (arg))
572 {
573 tree name = ssa_default_def (fun, arg);
574 if (name)
575 execute_cse_reciprocals_1 (NULL, name);
576 }
577
578 FOR_EACH_BB_FN (bb, fun)
579 {
580 tree def;
581
582 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
583 gsi_next (&gsi))
584 {
585 gphi *phi = gsi.phi ();
586 def = PHI_RESULT (phi);
587 if (! virtual_operand_p (def)
588 && FLOAT_TYPE_P (TREE_TYPE (def)))
589 execute_cse_reciprocals_1 (NULL, def);
590 }
591
592 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
593 gsi_next (&gsi))
594 {
595 gimple stmt = gsi_stmt (gsi);
596
597 if (gimple_has_lhs (stmt)
598 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
599 && FLOAT_TYPE_P (TREE_TYPE (def))
600 && TREE_CODE (def) == SSA_NAME)
601 execute_cse_reciprocals_1 (&gsi, def);
602 }
603
604 if (optimize_bb_for_size_p (bb))
605 continue;
606
607 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
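/* For example, when the target's builtin_reciprocal hook provides a
   reciprocal square root (spelled rsqrt here purely for illustration),
   a / sqrt (b) becomes a * rsqrt (b). */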
608 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
609 gsi_next (&gsi))
610 {
611 gimple stmt = gsi_stmt (gsi);
612 tree fndecl;
613
614 if (is_gimple_assign (stmt)
615 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
616 {
617 tree arg1 = gimple_assign_rhs2 (stmt);
618 gimple stmt1;
619
620 if (TREE_CODE (arg1) != SSA_NAME)
621 continue;
622
623 stmt1 = SSA_NAME_DEF_STMT (arg1);
624
625 if (is_gimple_call (stmt1)
626 && gimple_call_lhs (stmt1)
627 && (fndecl = gimple_call_fndecl (stmt1))
628 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
629 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
630 {
631 enum built_in_function code;
632 bool md_code, fail;
633 imm_use_iterator ui;
634 use_operand_p use_p;
635
636 code = DECL_FUNCTION_CODE (fndecl);
637 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
638
639 fndecl = targetm.builtin_reciprocal (code, md_code, false);
640 if (!fndecl)
641 continue;
642
643 /* Check that all uses of the SSA name are divisions,
644 otherwise replacing the defining statement will do
645 the wrong thing. */
646 fail = false;
647 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
648 {
649 gimple stmt2 = USE_STMT (use_p);
650 if (is_gimple_debug (stmt2))
651 continue;
652 if (!is_gimple_assign (stmt2)
653 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
654 || gimple_assign_rhs1 (stmt2) == arg1
655 || gimple_assign_rhs2 (stmt2) != arg1)
656 {
657 fail = true;
658 break;
659 }
660 }
661 if (fail)
662 continue;
663
664 gimple_replace_ssa_lhs (stmt1, arg1);
665 gimple_call_set_fndecl (stmt1, fndecl);
666 update_stmt (stmt1);
667 reciprocal_stats.rfuncs_inserted++;
668
669 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
670 {
671 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
672 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
673 fold_stmt_inplace (&gsi);
674 update_stmt (stmt);
675 }
676 }
677 }
678 }
679 }
680
681 statistics_counter_event (fun, "reciprocal divs inserted",
682 reciprocal_stats.rdivs_inserted);
683 statistics_counter_event (fun, "reciprocal functions inserted",
684 reciprocal_stats.rfuncs_inserted);
685
686 free_dominance_info (CDI_DOMINATORS);
687 free_dominance_info (CDI_POST_DOMINATORS);
688 free_alloc_pool (occ_pool);
689 return 0;
690 }
691
692 } // anon namespace
693
694 gimple_opt_pass *
695 make_pass_cse_reciprocals (gcc::context *ctxt)
696 {
697 return new pass_cse_reciprocals (ctxt);
698 }
699
700 /* Records an occurrence at statement USE_STMT in the vector of statements
701 STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is not
702 yet initialized. Returns true if the occurrence was pushed on
703 the vector. Adjusts *TOP_BB to be the basic block dominating all
704 statements in the vector. */
705
706 static bool
707 maybe_record_sincos (vec<gimple> *stmts,
708 basic_block *top_bb, gimple use_stmt)
709 {
710 basic_block use_bb = gimple_bb (use_stmt);
711 if (*top_bb
712 && (*top_bb == use_bb
713 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
714 stmts->safe_push (use_stmt);
715 else if (!*top_bb
716 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
717 {
718 stmts->safe_push (use_stmt);
719 *top_bb = use_bb;
720 }
721 else
722 return false;
723
724 return true;
725 }
726
727 /* Look for sin, cos and cexpi calls with the same argument NAME and
728 create a single call to cexpi CSEing the result in this case.
729 We first walk over all immediate uses of the argument collecting
730 statements that we can CSE in a vector and in a second pass replace
731 the statement rhs with a REALPART or IMAGPART expression on the
732 result of the cexpi call we insert before the use statement that
733 dominates all other candidates. */
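/* A rough sketch of the transformation, with the SSA temporary renamed
   for clarity (the real name is generated):

     s_1 = sin (x_0);                  t_9 = cexpi (x_0);
     c_2 = cos (x_0);        ===>      s_1 = IMAGPART_EXPR <t_9>;
                                       c_2 = REALPART_EXPR <t_9>;  */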
734
735 static bool
736 execute_cse_sincos_1 (tree name)
737 {
738 gimple_stmt_iterator gsi;
739 imm_use_iterator use_iter;
740 tree fndecl, res, type;
741 gimple def_stmt, use_stmt, stmt;
742 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
743 vec<gimple> stmts = vNULL;
744 basic_block top_bb = NULL;
745 int i;
746 bool cfg_changed = false;
747
748 type = TREE_TYPE (name);
749 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
750 {
751 if (gimple_code (use_stmt) != GIMPLE_CALL
752 || !gimple_call_lhs (use_stmt)
753 || !(fndecl = gimple_call_fndecl (use_stmt))
754 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
755 continue;
756
757 switch (DECL_FUNCTION_CODE (fndecl))
758 {
759 CASE_FLT_FN (BUILT_IN_COS):
760 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
761 break;
762
763 CASE_FLT_FN (BUILT_IN_SIN):
764 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
765 break;
766
767 CASE_FLT_FN (BUILT_IN_CEXPI):
768 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
769 break;
770
771 default:;
772 }
773 }
774
775 if (seen_cos + seen_sin + seen_cexpi <= 1)
776 {
777 stmts.release ();
778 return false;
779 }
780
781 /* Simply insert cexpi at the beginning of top_bb but not earlier than
782 the name def statement. */
783 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
784 if (!fndecl)
785 return false;
786 stmt = gimple_build_call (fndecl, 1, name);
787 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
788 gimple_call_set_lhs (stmt, res);
789
790 def_stmt = SSA_NAME_DEF_STMT (name);
791 if (!SSA_NAME_IS_DEFAULT_DEF (name)
792 && gimple_code (def_stmt) != GIMPLE_PHI
793 && gimple_bb (def_stmt) == top_bb)
794 {
795 gsi = gsi_for_stmt (def_stmt);
796 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
797 }
798 else
799 {
800 gsi = gsi_after_labels (top_bb);
801 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
802 }
803 sincos_stats.inserted++;
804
805 /* And adjust the recorded old call sites. */
806 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
807 {
808 tree rhs = NULL;
809 fndecl = gimple_call_fndecl (use_stmt);
810
811 switch (DECL_FUNCTION_CODE (fndecl))
812 {
813 CASE_FLT_FN (BUILT_IN_COS):
814 rhs = fold_build1 (REALPART_EXPR, type, res);
815 break;
816
817 CASE_FLT_FN (BUILT_IN_SIN):
818 rhs = fold_build1 (IMAGPART_EXPR, type, res);
819 break;
820
821 CASE_FLT_FN (BUILT_IN_CEXPI):
822 rhs = res;
823 break;
824
825 default:;
826 gcc_unreachable ();
827 }
828
829 /* Replace call with a copy. */
830 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
831
832 gsi = gsi_for_stmt (use_stmt);
833 gsi_replace (&gsi, stmt, true);
834 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
835 cfg_changed = true;
836 }
837
838 stmts.release ();
839
840 return cfg_changed;
841 }
842
843 /* To evaluate powi(x,n), the floating point value x raised to the
844 constant integer exponent n, we use a hybrid algorithm that
845 combines the "window method" with look-up tables. For an
846 introduction to exponentiation algorithms and "addition chains",
847 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
848 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
849 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
850 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
851
852 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
853 multiplications to inline before calling the system library's pow
854 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
855 so this default never requires calling pow, powf or powl. */
856
857 #ifndef POWI_MAX_MULTS
858 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
859 #endif
860
861 /* The size of the "optimal power tree" lookup table. All
862 exponents less than this value are simply looked up in the
863 powi_table below. This threshold is also used to size the
864 cache of pseudo registers that hold intermediate results. */
865 #define POWI_TABLE_SIZE 256
866
867 /* The size, in bits, of the window used in the "window method"
868 exponentiation algorithm. This is equivalent to a radix of
869 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
870 #define POWI_WINDOW_SIZE 3
871
872 /* The following table is an efficient representation of an
873 "optimal power tree". For each value, i, the corresponding
874 value, j, in the table states that an optimal evaluation
875 sequence for calculating pow(x,i) can be found by evaluating
876 pow(x,j)*pow(x,i-j). An optimal power tree for the first
877 100 integers is given in Knuth's "Seminumerical algorithms". */
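/* For example, powi_table[15] is 9, so x**15 is evaluated as
   x**9 * x**6; recursively, x**9 = x**6 * x**3, x**6 = x**3 * x**3,
   x**3 = x**2 * x and x**2 = x * x, for a total of five multiplications
   (one per intermediate power), versus six for plain binary
   square-and-multiply. */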
878
879 static const unsigned char powi_table[POWI_TABLE_SIZE] =
880 {
881 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
882 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
883 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
884 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
885 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
886 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
887 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
888 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
889 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
890 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
891 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
892 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
893 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
894 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
895 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
896 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
897 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
898 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
899 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
900 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
901 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
902 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
903 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
904 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
905 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
906 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
907 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
908 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
909 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
910 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
911 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
912 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
913 };
914
915
916 /* Return the number of multiplications required to calculate
917 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
918 subroutine of powi_cost. CACHE is an array indicating
919 which exponents have already been calculated. */
920
921 static int
922 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
923 {
924 /* If we've already calculated this exponent, then this evaluation
925 doesn't require any additional multiplications. */
926 if (cache[n])
927 return 0;
928
929 cache[n] = true;
930 return powi_lookup_cost (n - powi_table[n], cache)
931 + powi_lookup_cost (powi_table[n], cache) + 1;
932 }
933
934 /* Return the number of multiplications required to calculate
935 powi(x,n) for an arbitrary x, given the exponent N. This
936 function needs to be kept in sync with powi_as_mults below. */
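/* In the loop below, while VAL does not fit the lookup table, an odd VAL
   is charged powi_lookup_cost (VAL & 7) multiplications for the low-order
   window, plus POWI_WINDOW_SIZE squarings, plus one multiply to combine
   them, following x**val = (x**(val>>3))**8 * x**(val&7); an even VAL is
   charged a single squaring. */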
937
938 static int
939 powi_cost (HOST_WIDE_INT n)
940 {
941 bool cache[POWI_TABLE_SIZE];
942 unsigned HOST_WIDE_INT digit;
943 unsigned HOST_WIDE_INT val;
944 int result;
945
946 if (n == 0)
947 return 0;
948
949 /* Ignore the reciprocal when calculating the cost. */
950 val = (n < 0) ? -n : n;
951
952 /* Initialize the exponent cache. */
953 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
954 cache[1] = true;
955
956 result = 0;
957
958 while (val >= POWI_TABLE_SIZE)
959 {
960 if (val & 1)
961 {
962 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
963 result += powi_lookup_cost (digit, cache)
964 + POWI_WINDOW_SIZE + 1;
965 val >>= POWI_WINDOW_SIZE;
966 }
967 else
968 {
969 val >>= 1;
970 result++;
971 }
972 }
973
974 return result + powi_lookup_cost (val, cache);
975 }
976
977 /* Recursive subroutine of powi_as_mults. This function takes the
978 array, CACHE, of already calculated exponents and an exponent N and
979 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
980
981 static tree
982 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
983 HOST_WIDE_INT n, tree *cache)
984 {
985 tree op0, op1, ssa_target;
986 unsigned HOST_WIDE_INT digit;
987 gassign *mult_stmt;
988
989 if (n < POWI_TABLE_SIZE && cache[n])
990 return cache[n];
991
992 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
993
994 if (n < POWI_TABLE_SIZE)
995 {
996 cache[n] = ssa_target;
997 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
998 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
999 }
1000 else if (n & 1)
1001 {
1002 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1003 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1004 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1005 }
1006 else
1007 {
1008 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1009 op1 = op0;
1010 }
1011
1012 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1013 gimple_set_location (mult_stmt, loc);
1014 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1015
1016 return ssa_target;
1017 }
1018
1019 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1020 This function needs to be kept in sync with powi_cost above. */
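/* For instance, powi (x, -3) expands to (modulo the generated SSA names)

     powmult_1 = x * x;
     powmult_2 = x * powmult_1;
     powmult_3 = 1.0 / powmult_2;  */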
1021
1022 static tree
1023 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1024 tree arg0, HOST_WIDE_INT n)
1025 {
1026 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1027 gassign *div_stmt;
1028 tree target;
1029
1030 if (n == 0)
1031 return build_real (type, dconst1);
1032
1033 memset (cache, 0, sizeof (cache));
1034 cache[1] = arg0;
1035
1036 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1037 if (n >= 0)
1038 return result;
1039
1040 /* If the original exponent was negative, reciprocate the result. */
1041 target = make_temp_ssa_name (type, NULL, "powmult");
1042 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1043 build_real (type, dconst1),
1044 result);
1045 gimple_set_location (div_stmt, loc);
1046 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1047
1048 return target;
1049 }
1050
1051 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1052 location info LOC. If the arguments are appropriate, create an
1053 equivalent sequence of statements prior to GSI using an optimal
1054 number of multiplications, and return an expression holding the
1055 result. */
1056
1057 static tree
1058 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1059 tree arg0, HOST_WIDE_INT n)
1060 {
1061 /* Avoid largest negative number. */
1062 if (n != -n
1063 && ((n >= -1 && n <= 2)
1064 || (optimize_function_for_speed_p (cfun)
1065 && powi_cost (n) <= POWI_MAX_MULTS)))
1066 return powi_as_mults (gsi, loc, arg0, n);
1067
1068 return NULL_TREE;
1069 }
1070
1071 /* Build a gimple call statement that calls FN with argument ARG.
1072 Set the lhs of the call statement to a fresh SSA name. Insert the
1073 statement prior to GSI's current position, and return the fresh
1074 SSA name. */
1075
1076 static tree
1077 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1078 tree fn, tree arg)
1079 {
1080 gcall *call_stmt;
1081 tree ssa_target;
1082
1083 call_stmt = gimple_build_call (fn, 1, arg);
1084 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1085 gimple_set_lhs (call_stmt, ssa_target);
1086 gimple_set_location (call_stmt, loc);
1087 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1088
1089 return ssa_target;
1090 }
1091
1092 /* Build a gimple binary operation with the given CODE and arguments
1093 ARG0, ARG1, assigning the result to a new SSA name created from
1094 NAME. Insert the statement prior to GSI's current position, and
1095 return the fresh SSA name. */
1096
1097 static tree
1098 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1099 const char *name, enum tree_code code,
1100 tree arg0, tree arg1)
1101 {
1102 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1103 gassign *stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1104 gimple_set_location (stmt, loc);
1105 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1106 return result;
1107 }
1108
1109 /* Build a gimple reference operation with the given CODE and argument
1110 ARG, assigning the result to a new SSA name of TYPE with NAME.
1111 Insert the statement prior to GSI's current position, and return
1112 the fresh SSA name. */
1113
1114 static inline tree
1115 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1116 const char *name, enum tree_code code, tree arg0)
1117 {
1118 tree result = make_temp_ssa_name (type, NULL, name);
1119 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1120 gimple_set_location (stmt, loc);
1121 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1122 return result;
1123 }
1124
1125 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1126 prior to GSI's current position, and return the fresh SSA name. */
1127
1128 static tree
1129 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1130 tree type, tree val)
1131 {
1132 tree result = make_ssa_name (type, NULL);
1133 gassign *stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val);
1134 gimple_set_location (stmt, loc);
1135 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1136 return result;
1137 }
1138
1139 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1140 with location info LOC. If possible, create an equivalent and
1141 less expensive sequence of statements prior to GSI, and return an
1142 expression holding the result. */
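/* Two representative cases handled below, assuming
   -funsafe-math-optimizations together with the additional guards checked
   in the code (a hardware sqrt, optimizing for speed, ...):
   pow (x, 0.75) becomes sqrt (x) * sqrt (sqrt (x)), and pow (x, 2.5)
   becomes sqrt (x) * (x * x). */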
1143
1144 static tree
1145 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1146 tree arg0, tree arg1)
1147 {
1148 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1149 REAL_VALUE_TYPE c2, dconst3;
1150 HOST_WIDE_INT n;
1151 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1152 machine_mode mode;
1153 bool hw_sqrt_exists, c_is_int, c2_is_int;
1154
1155 /* If the exponent isn't a constant, there's nothing of interest
1156 to be done. */
1157 if (TREE_CODE (arg1) != REAL_CST)
1158 return NULL_TREE;
1159
1160 /* If the exponent is equivalent to an integer, expand to an optimal
1161 multiplication sequence when profitable. */
1162 c = TREE_REAL_CST (arg1);
1163 n = real_to_integer (&c);
1164 real_from_integer (&cint, VOIDmode, n, SIGNED);
1165 c_is_int = real_identical (&c, &cint);
1166
1167 if (c_is_int
1168 && ((n >= -1 && n <= 2)
1169 || (flag_unsafe_math_optimizations
1170 && optimize_bb_for_speed_p (gsi_bb (*gsi))
1171 && powi_cost (n) <= POWI_MAX_MULTS)))
1172 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1173
1174 /* Attempt various optimizations using sqrt and cbrt. */
1175 type = TREE_TYPE (arg0);
1176 mode = TYPE_MODE (type);
1177 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1178
1179 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1180 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1181 sqrt(-0) = -0. */
1182 if (sqrtfn
1183 && REAL_VALUES_EQUAL (c, dconsthalf)
1184 && !HONOR_SIGNED_ZEROS (mode))
1185 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1186
1187 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1188 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1189 so do this optimization even if -Os. Don't do this optimization
1190 if we don't have a hardware sqrt insn. */
1191 dconst1_4 = dconst1;
1192 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1193 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1194
1195 if (flag_unsafe_math_optimizations
1196 && sqrtfn
1197 && REAL_VALUES_EQUAL (c, dconst1_4)
1198 && hw_sqrt_exists)
1199 {
1200 /* sqrt(x) */
1201 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1202
1203 /* sqrt(sqrt(x)) */
1204 return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1205 }
1206
1207 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1208 optimizing for space. Don't do this optimization if we don't have
1209 a hardware sqrt insn. */
1210 real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1211 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1212
1213 if (flag_unsafe_math_optimizations
1214 && sqrtfn
1215 && optimize_function_for_speed_p (cfun)
1216 && REAL_VALUES_EQUAL (c, dconst3_4)
1217 && hw_sqrt_exists)
1218 {
1219 /* sqrt(x) */
1220 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1221
1222 /* sqrt(sqrt(x)) */
1223 sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1224
1225 /* sqrt(x) * sqrt(sqrt(x)) */
1226 return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1227 sqrt_arg0, sqrt_sqrt);
1228 }
1229
1230 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1231 optimizations since 1./3. is not exactly representable. If x
1232 is negative and finite, the correct value of pow(x,1./3.) is
1233 a NaN with the "invalid" exception raised, because the value
1234 of 1./3. actually has an even denominator. The correct value
1235 of cbrt(x) is a negative real value. */
1236 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1237 dconst1_3 = real_value_truncate (mode, dconst_third ());
1238
1239 if (flag_unsafe_math_optimizations
1240 && cbrtfn
1241 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1242 && REAL_VALUES_EQUAL (c, dconst1_3))
1243 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1244
1245 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1246 if we don't have a hardware sqrt insn. */
1247 dconst1_6 = dconst1_3;
1248 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1249
1250 if (flag_unsafe_math_optimizations
1251 && sqrtfn
1252 && cbrtfn
1253 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254 && optimize_function_for_speed_p (cfun)
1255 && hw_sqrt_exists
1256 && REAL_VALUES_EQUAL (c, dconst1_6))
1257 {
1258 /* sqrt(x) */
1259 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1260
1261 /* cbrt(sqrt(x)) */
1262 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1263 }
1264
1265 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1266 and c not an integer, into
1267
1268 sqrt(x) * powi(x, n/2), n > 0;
1269 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1270
1271 Do not calculate the powi factor when n/2 = 0. */
1272 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1273 n = real_to_integer (&c2);
1274 real_from_integer (&cint, VOIDmode, n, SIGNED);
1275 c2_is_int = real_identical (&c2, &cint);
1276
1277 if (flag_unsafe_math_optimizations
1278 && sqrtfn
1279 && c2_is_int
1280 && !c_is_int
1281 && optimize_function_for_speed_p (cfun))
1282 {
1283 tree powi_x_ndiv2 = NULL_TREE;
1284
1285 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1286 possible or profitable, give up. Skip the degenerate case when
1287 n is 1 or -1, where the result is always 1. */
1288 if (absu_hwi (n) != 1)
1289 {
1290 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1291 abs_hwi (n / 2));
1292 if (!powi_x_ndiv2)
1293 return NULL_TREE;
1294 }
1295
1296 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1297 result of the optimal multiply sequence just calculated. */
1298 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1299
1300 if (absu_hwi (n) == 1)
1301 result = sqrt_arg0;
1302 else
1303 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1304 sqrt_arg0, powi_x_ndiv2);
1305
1306 /* If n is negative, reciprocate the result. */
1307 if (n < 0)
1308 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1309 build_real (type, dconst1), result);
1310 return result;
1311 }
1312
1313 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1314
1315 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1316 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1317
1318 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1319 different from pow(x, 1./3.) due to rounding and behavior with
1320 negative x, we need to constrain this transformation to unsafe
1321 math and positive x or finite math. */
1322 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1323 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1324 real_round (&c2, mode, &c2);
1325 n = real_to_integer (&c2);
1326 real_from_integer (&cint, VOIDmode, n, SIGNED);
1327 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1328 real_convert (&c2, mode, &c2);
1329
1330 if (flag_unsafe_math_optimizations
1331 && cbrtfn
1332 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1333 && real_identical (&c2, &c)
1334 && !c2_is_int
1335 && optimize_function_for_speed_p (cfun)
1336 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1337 {
1338 tree powi_x_ndiv3 = NULL_TREE;
1339
1340 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1341 possible or profitable, give up. Skip the degenerate case when
1342 abs(n) < 3, where the result is always 1. */
1343 if (absu_hwi (n) >= 3)
1344 {
1345 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1346 abs_hwi (n / 3));
1347 if (!powi_x_ndiv3)
1348 return NULL_TREE;
1349 }
1350
1351 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1352 as that creates an unnecessary variable. Instead, just produce
1353 either cbrt(x) or cbrt(x) * cbrt(x). */
1354 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1355
1356 if (absu_hwi (n) % 3 == 1)
1357 powi_cbrt_x = cbrt_x;
1358 else
1359 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1360 cbrt_x, cbrt_x);
1361
1362 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1363 if (absu_hwi (n) < 3)
1364 result = powi_cbrt_x;
1365 else
1366 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1367 powi_x_ndiv3, powi_cbrt_x);
1368
1369 /* If n is negative, reciprocate the result. */
1370 if (n < 0)
1371 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1372 build_real (type, dconst1), result);
1373
1374 return result;
1375 }
1376
1377 /* No optimizations succeeded. */
1378 return NULL_TREE;
1379 }
1380
1381 /* ARG is the argument to a cabs builtin call in GSI with location info
1382 LOC. Create a sequence of statements prior to GSI that calculates
1383 sqrt(R*R + I*I), where R and I are the real and imaginary components
1384 of ARG, respectively. Return an expression holding the result. */
1385
1386 static tree
1387 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1388 {
1389 tree real_part, imag_part, addend1, addend2, sum, result;
1390 tree type = TREE_TYPE (TREE_TYPE (arg));
1391 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1392 machine_mode mode = TYPE_MODE (type);
1393
1394 if (!flag_unsafe_math_optimizations
1395 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1396 || !sqrtfn
1397 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1398 return NULL_TREE;
1399
1400 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1401 REALPART_EXPR, arg);
1402 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1403 real_part, real_part);
1404 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1405 IMAGPART_EXPR, arg);
1406 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1407 imag_part, imag_part);
1408 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1409 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1410
1411 return result;
1412 }
1413
1414 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1415 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1416 an optimal number of multiplies, when n is a constant. */
1417
1418 namespace {
1419
1420 const pass_data pass_data_cse_sincos =
1421 {
1422 GIMPLE_PASS, /* type */
1423 "sincos", /* name */
1424 OPTGROUP_NONE, /* optinfo_flags */
1425 TV_NONE, /* tv_id */
1426 PROP_ssa, /* properties_required */
1427 0, /* properties_provided */
1428 0, /* properties_destroyed */
1429 0, /* todo_flags_start */
1430 TODO_update_ssa, /* todo_flags_finish */
1431 };
1432
1433 class pass_cse_sincos : public gimple_opt_pass
1434 {
1435 public:
1436 pass_cse_sincos (gcc::context *ctxt)
1437 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1438 {}
1439
1440 /* opt_pass methods: */
1441 virtual bool gate (function *)
1442 {
1443 /* We no longer require either sincos or cexp, since powi expansion
1444 piggybacks on this pass. */
1445 return optimize;
1446 }
1447
1448 virtual unsigned int execute (function *);
1449
1450 }; // class pass_cse_sincos
1451
1452 unsigned int
1453 pass_cse_sincos::execute (function *fun)
1454 {
1455 basic_block bb;
1456 bool cfg_changed = false;
1457
1458 calculate_dominance_info (CDI_DOMINATORS);
1459 memset (&sincos_stats, 0, sizeof (sincos_stats));
1460
1461 FOR_EACH_BB_FN (bb, fun)
1462 {
1463 gimple_stmt_iterator gsi;
1464 bool cleanup_eh = false;
1465
1466 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1467 {
1468 gimple stmt = gsi_stmt (gsi);
1469 tree fndecl;
1470
1471 /* Only the last stmt in a bb could throw, no need to call
1472 gimple_purge_dead_eh_edges if we change something in the middle
1473 of a basic block. */
1474 cleanup_eh = false;
1475
1476 if (is_gimple_call (stmt)
1477 && gimple_call_lhs (stmt)
1478 && (fndecl = gimple_call_fndecl (stmt))
1479 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1480 {
1481 tree arg, arg0, arg1, result;
1482 HOST_WIDE_INT n;
1483 location_t loc;
1484
1485 switch (DECL_FUNCTION_CODE (fndecl))
1486 {
1487 CASE_FLT_FN (BUILT_IN_COS):
1488 CASE_FLT_FN (BUILT_IN_SIN):
1489 CASE_FLT_FN (BUILT_IN_CEXPI):
1490 /* Make sure we have either sincos or cexp. */
1491 if (!targetm.libc_has_function (function_c99_math_complex)
1492 && !targetm.libc_has_function (function_sincos))
1493 break;
1494
1495 arg = gimple_call_arg (stmt, 0);
1496 if (TREE_CODE (arg) == SSA_NAME)
1497 cfg_changed |= execute_cse_sincos_1 (arg);
1498 break;
1499
1500 CASE_FLT_FN (BUILT_IN_POW):
1501 arg0 = gimple_call_arg (stmt, 0);
1502 arg1 = gimple_call_arg (stmt, 1);
1503
1504 loc = gimple_location (stmt);
1505 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1506
1507 if (result)
1508 {
1509 tree lhs = gimple_get_lhs (stmt);
1510 gassign *new_stmt = gimple_build_assign (lhs, result);
1511 gimple_set_location (new_stmt, loc);
1512 unlink_stmt_vdef (stmt);
1513 gsi_replace (&gsi, new_stmt, true);
1514 cleanup_eh = true;
1515 if (gimple_vdef (stmt))
1516 release_ssa_name (gimple_vdef (stmt));
1517 }
1518 break;
1519
1520 CASE_FLT_FN (BUILT_IN_POWI):
1521 arg0 = gimple_call_arg (stmt, 0);
1522 arg1 = gimple_call_arg (stmt, 1);
1523 loc = gimple_location (stmt);
1524
1525 if (real_minus_onep (arg0))
1526 {
1527 tree t0, t1, cond, one, minus_one;
1528 gassign *stmt;
1529
1530 t0 = TREE_TYPE (arg0);
1531 t1 = TREE_TYPE (arg1);
1532 one = build_real (t0, dconst1);
1533 minus_one = build_real (t0, dconstm1);
1534
1535 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1536 stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1537 arg1,
1538 build_int_cst (t1,
1539 1));
1540 gimple_set_location (stmt, loc);
1541 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1542
1543 result = make_temp_ssa_name (t0, NULL, "powi");
1544 stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1545 cond,
1546 minus_one, one);
1547 gimple_set_location (stmt, loc);
1548 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1549 }
1550 else
1551 {
1552 if (!tree_fits_shwi_p (arg1))
1553 break;
1554
1555 n = tree_to_shwi (arg1);
1556 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1557 }
1558
1559 if (result)
1560 {
1561 tree lhs = gimple_get_lhs (stmt);
1562 gassign *new_stmt = gimple_build_assign (lhs, result);
1563 gimple_set_location (new_stmt, loc);
1564 unlink_stmt_vdef (stmt);
1565 gsi_replace (&gsi, new_stmt, true);
1566 cleanup_eh = true;
1567 if (gimple_vdef (stmt))
1568 release_ssa_name (gimple_vdef (stmt));
1569 }
1570 break;
1571
1572 CASE_FLT_FN (BUILT_IN_CABS):
1573 arg0 = gimple_call_arg (stmt, 0);
1574 loc = gimple_location (stmt);
1575 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1576
1577 if (result)
1578 {
1579 tree lhs = gimple_get_lhs (stmt);
1580 gassign *new_stmt = gimple_build_assign (lhs, result);
1581 gimple_set_location (new_stmt, loc);
1582 unlink_stmt_vdef (stmt);
1583 gsi_replace (&gsi, new_stmt, true);
1584 cleanup_eh = true;
1585 if (gimple_vdef (stmt))
1586 release_ssa_name (gimple_vdef (stmt));
1587 }
1588 break;
1589
1590 default:;
1591 }
1592 }
1593 }
1594 if (cleanup_eh)
1595 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1596 }
1597
1598 statistics_counter_event (fun, "sincos statements inserted",
1599 sincos_stats.inserted);
1600
1601 free_dominance_info (CDI_DOMINATORS);
1602 return cfg_changed ? TODO_cleanup_cfg : 0;
1603 }
1604
1605 } // anon namespace
1606
1607 gimple_opt_pass *
1608 make_pass_cse_sincos (gcc::context *ctxt)
1609 {
1610 return new pass_cse_sincos (ctxt);
1611 }
1612
1613 /* A symbolic number is used to detect byte permutation and selection
1614 patterns. Therefore the field N contains an artificial number
1615 consisting of octet sized markers:
1616
1617 0 - target byte has the value 0
1618 FF - target byte has an unknown value (eg. due to sign extension)
1619 1..size - marker value is the byte index in the source (1 for the
1620 least significant byte).
1620
1621 To detect permutations on memory sources (arrays and structures), a symbolic
1622 number is also associated with a base address (the array or structure the
1623 load is made from), an offset from that base address and a range which gives
1624 the difference between the highest and lowest accessed memory locations used
1625 to make up the symbolic number. The range is thus different from size, which
1626 reflects the size of the type of the current expression. Note that for a
1627 non-memory source, range holds the same value as size.
1628
1629 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1630 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1631 still have a size of 2 but this time a range of 1. */
1632
1633 struct symbolic_number {
1634 uint64_t n;
1635 tree type;
1636 tree base_addr;
1637 tree offset;
1638 HOST_WIDE_INT bytepos;
1639 tree alias_set;
1640 tree vuse;
1641 unsigned HOST_WIDE_INT range;
1642 };
1643
1644 #define BITS_PER_MARKER 8
1645 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1646 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1647 #define HEAD_MARKER(n, size) \
1648 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1649
1650 /* The number which the find_bswap_or_nop_1 result should match in
1651 order to have a nop. The number is masked according to the size of
1652 the symbolic number before using it. */
1653 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1654 (uint64_t)0x08070605 << 32 | 0x04030201)
1655
1656 /* The number which the find_bswap_or_nop_1 result should match in
1657 order to have a byte swap. The number is masked according to the
1658 size of the symbolic number before using it. */
1659 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1660 (uint64_t)0x01020304 << 32 | 0x05060708)
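/* For instance, for a 32-bit source the nop pattern reduces to 0x04030201
   (each target byte still holds the source byte of the same rank), while
   a full byte swap produces the symbolic number 0x01020304 (the byte
   order exactly reversed). */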
1661
1662 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1663 number N. Return false if the requested operation is not permitted
1664 on a symbolic number. */
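/* For example, shifting the 4-byte nop pattern 0x04030201 left by 8 bits
   yields 0x03020100: the least significant target byte is now known to be
   zero and the most significant source byte has been shifted out. */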
1665
1666 static inline bool
1667 do_shift_rotate (enum tree_code code,
1668 struct symbolic_number *n,
1669 int count)
1670 {
1671 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1672 unsigned head_marker;
1673
1674 if (count % BITS_PER_UNIT != 0)
1675 return false;
1676 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1677
1678 /* Zero out the extra bits of N in order to avoid them being shifted
1679 into the significant bits. */
1680 if (size < 64 / BITS_PER_MARKER)
1681 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1682
1683 switch (code)
1684 {
1685 case LSHIFT_EXPR:
1686 n->n <<= count;
1687 break;
1688 case RSHIFT_EXPR:
1689 head_marker = HEAD_MARKER (n->n, size);
1690 n->n >>= count;
1691 /* Arithmetic shift of signed type: result is dependent on the value. */
1692 if (!TYPE_UNSIGNED (n->type) && head_marker)
1693 for (i = 0; i < count / BITS_PER_MARKER; i++)
1694 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1695 << ((size - 1 - i) * BITS_PER_MARKER);
1696 break;
1697 case LROTATE_EXPR:
1698 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1699 break;
1700 case RROTATE_EXPR:
1701 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1702 break;
1703 default:
1704 return false;
1705 }
1706 /* Zero unused bits for size. */
1707 if (size < 64 / BITS_PER_MARKER)
1708 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1709 return true;
1710 }
1711
1712 /* Perform sanity checking for the symbolic number N and the gimple
1713 statement STMT. */
1714
1715 static inline bool
1716 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1717 {
1718 tree lhs_type;
1719
1720 lhs_type = gimple_expr_type (stmt);
1721
1722 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1723 return false;
1724
1725 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1726 return false;
1727
1728 return true;
1729 }
1730
1731 /* Initialize the symbolic number N for the bswap pass from the base element
1732 SRC manipulated by the bitwise OR expression. */
1733
1734 static bool
1735 init_symbolic_number (struct symbolic_number *n, tree src)
1736 {
1737 int size;
1738
1739 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1740
1741 /* Set up the symbolic number N by setting each byte to a value between 1 and
1742 the byte size of rhs1. The highest order byte is set to that size and the
1743 lowest order byte to 1. */
1744 n->type = TREE_TYPE (src);
1745 size = TYPE_PRECISION (n->type);
1746 if (size % BITS_PER_UNIT != 0)
1747 return false;
1748 size /= BITS_PER_UNIT;
1749 if (size > 64 / BITS_PER_MARKER)
1750 return false;
1751 n->range = size;
1752 n->n = CMPNOP;
1753
1754 if (size < 64 / BITS_PER_MARKER)
1755 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1756
1757 return true;
1758 }
1759
1760 /* Check if STMT might be a byte swap or a nop from a memory source and return
1761 the answer. If so, REF is that memory source and the base of the memory area
1762 accessed and the offset of the access from that base are recorded in N. */
1763
1764 bool
1765 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1766 {
1767 /* Leaf node is an array or component ref. Memorize its base and
1768 offset from base to compare to other such leaf nodes. */
1769 HOST_WIDE_INT bitsize, bitpos;
1770 machine_mode mode;
1771 int unsignedp, volatilep;
1772 tree offset, base_addr;
1773
1774 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1775 return false;
1776
1777 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1778 &unsignedp, &volatilep, false);
1779
1780 if (TREE_CODE (base_addr) == MEM_REF)
1781 {
1782 offset_int bit_offset = 0;
1783 tree off = TREE_OPERAND (base_addr, 1);
1784
1785 if (!integer_zerop (off))
1786 {
1787 offset_int boff, coff = mem_ref_offset (base_addr);
1788 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1789 bit_offset += boff;
1790 }
1791
1792 base_addr = TREE_OPERAND (base_addr, 0);
1793
1794 /* Avoid returning a negative bitpos as this may wreak havoc later. */
1795 if (wi::neg_p (bit_offset))
1796 {
1797 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1798 offset_int tem = bit_offset.and_not (mask);
1799 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1800 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
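	  /* For instance (illustrative values only): a BIT_OFFSET of -12 gives
	     TEM = -16, after which BIT_OFFSET becomes 4 and OFFSET is
	     decreased by 2 bytes, leaving the overall position unchanged.  */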
1801 bit_offset -= tem;
1802 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1803 if (offset)
1804 offset = size_binop (PLUS_EXPR, offset,
1805 wide_int_to_tree (sizetype, tem));
1806 else
1807 offset = wide_int_to_tree (sizetype, tem);
1808 }
1809
1810 bitpos += bit_offset.to_shwi ();
1811 }
1812
1813 if (bitpos % BITS_PER_UNIT)
1814 return false;
1815 if (bitsize % BITS_PER_UNIT)
1816 return false;
1817
1818 if (!init_symbolic_number (n, ref))
1819 return false;
1820 n->base_addr = base_addr;
1821 n->offset = offset;
1822 n->bytepos = bitpos / BITS_PER_UNIT;
1823 n->alias_set = reference_alias_ptr_type (ref);
1824 n->vuse = gimple_vuse (stmt);
1825 return true;
1826 }
1827
1828 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1829 the operation given by the rhs of STMT on the result. If the operation
1830 could successfully be executed, the function returns a gimple stmt whose
1831 rhs's first tree is the expression of the source operand, and NULL
1832 otherwise. */
1833
1834 static gimple
1835 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1836 {
1837 enum tree_code code;
1838 tree rhs1, rhs2 = NULL;
1839 gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1840 enum gimple_rhs_class rhs_class;
1841
1842 if (!limit || !is_gimple_assign (stmt))
1843 return NULL;
1844
1845 rhs1 = gimple_assign_rhs1 (stmt);
1846
1847 if (find_bswap_or_nop_load (stmt, rhs1, n))
1848 return stmt;
1849
1850 if (TREE_CODE (rhs1) != SSA_NAME)
1851 return NULL;
1852
1853 code = gimple_assign_rhs_code (stmt);
1854 rhs_class = gimple_assign_rhs_class (stmt);
1855 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1856
1857 if (rhs_class == GIMPLE_BINARY_RHS)
1858 rhs2 = gimple_assign_rhs2 (stmt);
1859
1860 /* Handle unary rhs and binary rhs with integer constants as second
1861 operand. */
1862
1863 if (rhs_class == GIMPLE_UNARY_RHS
1864 || (rhs_class == GIMPLE_BINARY_RHS
1865 && TREE_CODE (rhs2) == INTEGER_CST))
1866 {
1867 if (code != BIT_AND_EXPR
1868 && code != LSHIFT_EXPR
1869 && code != RSHIFT_EXPR
1870 && code != LROTATE_EXPR
1871 && code != RROTATE_EXPR
1872 && !CONVERT_EXPR_CODE_P (code))
1873 return NULL;
1874
1875 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1876
1877 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1878 we have to initialize the symbolic number. */
1879 if (!source_stmt1)
1880 {
1881 if (gimple_assign_load_p (stmt)
1882 || !init_symbolic_number (n, rhs1))
1883 return NULL;
1884 source_stmt1 = stmt;
1885 }
1886
1887 switch (code)
1888 {
1889 case BIT_AND_EXPR:
1890 {
1891 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1892 uint64_t val = int_cst_value (rhs2), mask = 0;
1893 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
1894
1895 /* Only constants masking full bytes are allowed. */
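	    /* E.g. a mask of 0x00ff00ff is accepted and clears the markers of
	       bytes 1 and 3, whereas 0x00000f00 masks only part of a byte and
	       is rejected below.  */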
1896 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
1897 if ((val & tmp) != 0 && (val & tmp) != tmp)
1898 return NULL;
1899 else if (val & tmp)
1900 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
1901
1902 n->n &= mask;
1903 }
1904 break;
1905 case LSHIFT_EXPR:
1906 case RSHIFT_EXPR:
1907 case LROTATE_EXPR:
1908 case RROTATE_EXPR:
1909 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1910 return NULL;
1911 break;
1912 CASE_CONVERT:
1913 {
1914 int i, type_size, old_type_size;
1915 tree type;
1916
1917 type = gimple_expr_type (stmt);
1918 type_size = TYPE_PRECISION (type);
1919 if (type_size % BITS_PER_UNIT != 0)
1920 return NULL;
1921 type_size /= BITS_PER_UNIT;
1922 if (type_size > 64 / BITS_PER_MARKER)
1923 return NULL;
1924
1925 /* Sign extension: result is dependent on the value. */
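	    /* E.g. widening a signed 2-byte value whose head marker is
	       non-zero to a 4-byte type marks bytes 2 and 3 as
	       MARKER_BYTE_UNKNOWN below.  */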
1926 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1927 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
1928 && HEAD_MARKER (n->n, old_type_size))
1929 for (i = 0; i < type_size - old_type_size; i++)
1930 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1931 << ((type_size - 1 - i) * BITS_PER_MARKER);
1932
1933 if (type_size < 64 / BITS_PER_MARKER)
1934 {
1935 /* If STMT casts to a smaller type mask out the bits not
1936 belonging to the target type. */
1937 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
1938 }
1939 n->type = type;
1940 if (!n->base_addr)
1941 n->range = type_size;
1942 }
1943 break;
1944 default:
1945 return NULL;
1946 };
1947 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
1948 }
1949
1950 /* Handle binary rhs. */
1951
1952 if (rhs_class == GIMPLE_BINARY_RHS)
1953 {
1954 int i, size;
1955 struct symbolic_number n1, n2;
1956 uint64_t mask;
1957 gimple source_stmt2;
1958
1959 if (code != BIT_IOR_EXPR)
1960 return NULL;
1961
1962 if (TREE_CODE (rhs2) != SSA_NAME)
1963 return NULL;
1964
1965 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1966
1967 switch (code)
1968 {
1969 case BIT_IOR_EXPR:
1970 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1971
1972 if (!source_stmt1)
1973 return NULL;
1974
1975 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1976
1977 if (!source_stmt2)
1978 return NULL;
1979
1980 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1981 return NULL;
1982
1983 if (!n1.vuse != !n2.vuse ||
1984 (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1985 return NULL;
1986
1987 if (gimple_assign_rhs1 (source_stmt1)
1988 != gimple_assign_rhs1 (source_stmt2))
1989 {
1990 int64_t inc;
1991 HOST_WIDE_INT off_sub;
1992 struct symbolic_number *n_ptr;
1993
1994 if (!n1.base_addr || !n2.base_addr
1995 || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1996 return NULL;
1997 if (!n1.offset != !n2.offset ||
1998 (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1999 return NULL;
2000
2001 /* We swap n1 with n2 to have n1 < n2. */
2002 if (n2.bytepos < n1.bytepos)
2003 {
2004 struct symbolic_number tmpn;
2005
2006 tmpn = n2;
2007 n2 = n1;
2008 n1 = tmpn;
2009 source_stmt1 = source_stmt2;
2010 }
2011
2012 off_sub = n2.bytepos - n1.bytepos;
2013
2014 /* Check that the range of memory covered can be represented by
2015 a symbolic number. */
2016 if (off_sub + n2.range > 64 / BITS_PER_MARKER)
2017 return NULL;
2018 n->range = n2.range + off_sub;
2019
2020 /* Reinterpret byte marks in symbolic number holding the value of
2021 bigger weight according to target endianness. */
2022 inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
2023 size = TYPE_PRECISION (n1.type) / BITS_PER_UNIT;
2024 if (BYTES_BIG_ENDIAN)
2025 n_ptr = &n1;
2026 else
2027 n_ptr = &n2;
2028 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2029 {
2030 unsigned marker =
2031 (n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2032 if (marker && marker != MARKER_BYTE_UNKNOWN)
2033 n_ptr->n += inc;
2034 }
2035 }
2036 else
2037 n->range = n1.range;
2038
2039 if (!n1.alias_set
2040 || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
2041 n->alias_set = n1.alias_set;
2042 else
2043 n->alias_set = ptr_type_node;
2044 n->vuse = n1.vuse;
2045 n->base_addr = n1.base_addr;
2046 n->offset = n1.offset;
2047 n->bytepos = n1.bytepos;
2048 n->type = n1.type;
2049 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2050 for (i = 0, mask = MARKER_MASK; i < size;
2051 i++, mask <<= BITS_PER_MARKER)
2052 {
2053 uint64_t masked1, masked2;
2054
2055 masked1 = n1.n & mask;
2056 masked2 = n2.n & mask;
2057 if (masked1 && masked2 && masked1 != masked2)
2058 return NULL;
2059 }
2060 n->n = n1.n | n2.n;
2061
2062 if (!verify_symbolic_number_p (n, stmt))
2063 return NULL;
2064
2065 break;
2066 default:
2067 return NULL;
2068 }
2069 return source_stmt1;
2070 }
2071 return NULL;
2072 }
2073
2074 /* Check if STMT completes a bswap implementation or a read in a given
2075 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2076 accordingly. It also sets N to represent the kind of operations
2077 performed: size of the resulting expression and whether it works on
2078 a memory source, and if so, its alias set and vuse. Finally, the
2079 function returns a stmt whose rhs's first tree is the source
2080 expression. */
2081
2082 static gimple
2083 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2084 {
2085 /* The number which the find_bswap_or_nop_1 result should match in order
2086 to have a full byte swap. The number is shifted to the right
2087 according to the size of the symbolic number before using it. */
2088 uint64_t cmpxchg = CMPXCHG;
2089 uint64_t cmpnop = CMPNOP;
2090
2091 gimple source_stmt;
2092 int limit;
2093
2094 /* The last parameter determines the search depth limit. It usually
2095 correlates directly to the number n of bytes to be touched. We
2096 increase that number by log2(n) + 1 here in order to also
2097 cover signed -> unsigned conversions of the src operand as can be seen
2098 in libgcc, and for initial shift/and operation of the src operand. */
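  /* For instance, for a 4-byte expression this gives
     limit = 4 + 1 + ceil_log2 (4) = 7 recursion levels.  */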
2099 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2100 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2101 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2102
2103 if (!source_stmt)
2104 return NULL;
2105
2106 /* Find real size of result (highest non zero byte). */
2107 if (n->base_addr)
2108 {
2109 int rsize;
2110 uint64_t tmpn;
2111
2112 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2113 n->range = rsize;
2114 }
2115
2116 /* Zero out the extra bits of N and CMP*. */
2117 if (n->range < (int) sizeof (int64_t))
2118 {
2119 uint64_t mask;
2120
2121 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2122 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2123 cmpnop &= mask;
2124 }
2125
2126 /* A complete byte swap should make the symbolic number start with
2127 the largest digit in the highest order byte. An unchanged symbolic
2128 number indicates a read with the same endianness as the target architecture. */
2129 if (n->n == cmpnop)
2130 *bswap = false;
2131 else if (n->n == cmpxchg)
2132 *bswap = true;
2133 else
2134 return NULL;
2135
2136 /* Useless bit manipulation performed by the code. */
2137 if (!n->base_addr && n->n == cmpnop)
2138 return NULL;
2139
2140 n->range *= BITS_PER_UNIT;
2141 return source_stmt;
2142 }
2143
2144 namespace {
2145
2146 const pass_data pass_data_optimize_bswap =
2147 {
2148 GIMPLE_PASS, /* type */
2149 "bswap", /* name */
2150 OPTGROUP_NONE, /* optinfo_flags */
2151 TV_NONE, /* tv_id */
2152 PROP_ssa, /* properties_required */
2153 0, /* properties_provided */
2154 0, /* properties_destroyed */
2155 0, /* todo_flags_start */
2156 0, /* todo_flags_finish */
2157 };
2158
2159 class pass_optimize_bswap : public gimple_opt_pass
2160 {
2161 public:
2162 pass_optimize_bswap (gcc::context *ctxt)
2163 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2164 {}
2165
2166 /* opt_pass methods: */
2167 virtual bool gate (function *)
2168 {
2169 return flag_expensive_optimizations && optimize;
2170 }
2171
2172 virtual unsigned int execute (function *);
2173
2174 }; // class pass_optimize_bswap
2175
2176 /* Perform the bswap optimization: replace the expression computed in the rhs
2177 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2178 Which of these alternatives replaces the rhs is given by N->base_addr (non
2179 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2180 load to perform are also given in N while the builtin bswap invocation is
2181 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2182 the load statements used to construct the rhs in CUR_STMT and N->range gives
2183 the size of the rhs expression for maintaining some statistics.
2184
2185 Note that if the replacement involves a load, CUR_STMT is moved just after
2186 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2187 changing basic block. */
2188
2189 static bool
2190 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2191 tree load_type, struct symbolic_number *n, bool bswap)
2192 {
2193 gimple_stmt_iterator gsi;
2194 tree src, tmp, tgt;
2195 gimple bswap_stmt;
2196
2197 gsi = gsi_for_stmt (cur_stmt);
2198 src = gimple_assign_rhs1 (src_stmt);
2199 tgt = gimple_assign_lhs (cur_stmt);
2200
2201 /* Need to load the value from memory first. */
2202 if (n->base_addr)
2203 {
2204 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2205 tree addr_expr, addr_tmp, val_expr, val_tmp;
2206 tree load_offset_ptr, aligned_load_type;
2207 gimple addr_stmt, load_stmt;
2208 unsigned align;
2209
2210 align = get_object_alignment (src);
2211 if (bswap
2212 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2213 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2214 return false;
2215
2216 /* Move cur_stmt just before one of the loads of the original
2217 expression to ensure it has the same VUSE. See PR61517 for what
2218 could go wrong. */
2219 gsi_move_before (&gsi, &gsi_ins);
2220 gsi = gsi_for_stmt (cur_stmt);
2221
2222 /* Compute address to load from and cast according to the size
2223 of the load. */
2224 addr_expr = build_fold_addr_expr (unshare_expr (src));
2225 if (is_gimple_min_invariant (addr_expr))
2226 addr_tmp = addr_expr;
2227 else
2228 {
2229 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2230 "load_src");
2231 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2232 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2233 }
2234
2235 /* Perform the load. */
2236 aligned_load_type = load_type;
2237 if (align < TYPE_ALIGN (load_type))
2238 aligned_load_type = build_aligned_type (load_type, align);
2239 load_offset_ptr = build_int_cst (n->alias_set, 0);
2240 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2241 load_offset_ptr);
2242
2243 if (!bswap)
2244 {
2245 if (n->range == 16)
2246 nop_stats.found_16bit++;
2247 else if (n->range == 32)
2248 nop_stats.found_32bit++;
2249 else
2250 {
2251 gcc_assert (n->range == 64);
2252 nop_stats.found_64bit++;
2253 }
2254
2255 /* Convert the result of load if necessary. */
2256 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2257 {
2258 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2259 "load_dst");
2260 load_stmt = gimple_build_assign (val_tmp, val_expr);
2261 gimple_set_vuse (load_stmt, n->vuse);
2262 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2263 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2264 }
2265 else
2266 {
2267 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2268 gimple_set_vuse (cur_stmt, n->vuse);
2269 }
2270 update_stmt (cur_stmt);
2271
2272 if (dump_file)
2273 {
2274 fprintf (dump_file,
2275 "%d bit load in target endianness found at: ",
2276 (int)n->range);
2277 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2278 }
2279 return true;
2280 }
2281 else
2282 {
2283 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2284 load_stmt = gimple_build_assign (val_tmp, val_expr);
2285 gimple_set_vuse (load_stmt, n->vuse);
2286 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2287 }
2288 src = val_tmp;
2289 }
2290
2291 if (n->range == 16)
2292 bswap_stats.found_16bit++;
2293 else if (n->range == 32)
2294 bswap_stats.found_32bit++;
2295 else
2296 {
2297 gcc_assert (n->range == 64);
2298 bswap_stats.found_64bit++;
2299 }
2300
2301 tmp = src;
2302
2303 /* Canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2304 values are considered, as rotating a 2N-bit value by N bits is generally
2305 not equivalent to a bswap. Consider for instance 0x01020304 rotated right
2306 by 16 bits, which gives 0x03040102, while a bswap of that value is 0x04030201. */
2307 if (bswap && n->range == 16)
2308 {
2309 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2310 bswap_type = TREE_TYPE (src);
2311 src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2312 bswap_stmt = gimple_build_assign (NULL, src);
2313 }
2314 else
2315 {
2316 /* Convert the src expression if necessary. */
2317 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2318 {
2319 gimple convert_stmt;
2320 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2321 convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src);
2322 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2323 }
2324
2325 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2326 }
2327
2328 tmp = tgt;
2329
2330 /* Convert the result if necessary. */
2331 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2332 {
2333 gimple convert_stmt;
2334 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2335 convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp);
2336 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2337 }
2338
2339 gimple_set_lhs (bswap_stmt, tmp);
2340
2341 if (dump_file)
2342 {
2343 fprintf (dump_file, "%d bit bswap implementation found at: ",
2344 (int)n->range);
2345 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2346 }
2347
2348 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2349 gsi_remove (&gsi, true);
2350 return true;
2351 }
2352
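/* Illustrative sketch (not part of the pass): the kind of source-level
   idiom the pass below is meant to recognize.  A manual 32-bit byte swap
   built from shifts, ANDs and ORs can in principle be replaced by a single
   __builtin_bswap32 call; whether that actually happens depends on the
   target's bswap support, so this is only a sketch.  Guarded with "#if 0"
   so it has no effect on the build.  */
#if 0
#include <stdint.h>

uint32_t
manual_bswap32 (uint32_t x)
{
  return (x << 24)
	 | ((x & UINT32_C (0x0000ff00)) << 8)
	 | ((x >> 8) & UINT32_C (0x0000ff00))
	 | (x >> 24);
}
#endif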
2353 /* Find manual byte swap implementations as well as loads in a given
2354 endianness. Byte swaps are turned into a bswap builtin invocation,
2355 while endian loads are converted to a bswap builtin invocation or a
2356 simple load according to the target endianness. */
2357
2358 unsigned int
2359 pass_optimize_bswap::execute (function *fun)
2360 {
2361 basic_block bb;
2362 bool bswap16_p, bswap32_p, bswap64_p;
2363 bool changed = false;
2364 tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2365
2366 if (BITS_PER_UNIT != 8)
2367 return 0;
2368
2369 bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
2370 && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
2371 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2372 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2373 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2374 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2375 || (bswap32_p && word_mode == SImode)));
2376
2377 /* Determine the argument type of the builtins. The code later on
2378 assumes that the return and argument type are the same. */
2379 if (bswap16_p)
2380 {
2381 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2382 bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2383 }
2384
2385 if (bswap32_p)
2386 {
2387 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2388 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2389 }
2390
2391 if (bswap64_p)
2392 {
2393 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2394 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2395 }
2396
2397 memset (&nop_stats, 0, sizeof (nop_stats));
2398 memset (&bswap_stats, 0, sizeof (bswap_stats));
2399
2400 FOR_EACH_BB_FN (bb, fun)
2401 {
2402 gimple_stmt_iterator gsi;
2403
2404 /* We do a reverse scan for bswap patterns to make sure we get the
2405 widest match. As bswap pattern matching doesn't handle previously
2406 inserted smaller bswap replacements as sub-patterns, the wider
2407 variant wouldn't be detected. */
2408 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2409 {
2410 gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2411 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2412 enum tree_code code;
2413 struct symbolic_number n;
2414 bool bswap;
2415
2416 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2417 might be moved to a different basic block by bswap_replace and gsi
2418 must not point to it in that case. Doing the gsi_prev here makes
2419 sure that gsi points to the statement previous to cur_stmt while
2420 still making sure that all statements in this basic block are
2421 considered. */
2422 gsi_prev (&gsi);
2423
2424 if (!is_gimple_assign (cur_stmt))
2425 continue;
2426
2427 code = gimple_assign_rhs_code (cur_stmt);
2428 switch (code)
2429 {
2430 case LROTATE_EXPR:
2431 case RROTATE_EXPR:
2432 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2433 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2434 % BITS_PER_UNIT)
2435 continue;
2436 /* Fall through. */
2437 case BIT_IOR_EXPR:
2438 break;
2439 default:
2440 continue;
2441 }
2442
2443 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2444
2445 if (!src_stmt)
2446 continue;
2447
2448 switch (n.range)
2449 {
2450 case 16:
2451 /* Already in canonical form, nothing to do. */
2452 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2453 continue;
2454 load_type = uint16_type_node;
2455 if (bswap16_p)
2456 {
2457 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2458 bswap_type = bswap16_type;
2459 }
2460 break;
2461 case 32:
2462 load_type = uint32_type_node;
2463 if (bswap32_p)
2464 {
2465 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2466 bswap_type = bswap32_type;
2467 }
2468 break;
2469 case 64:
2470 load_type = uint64_type_node;
2471 if (bswap64_p)
2472 {
2473 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2474 bswap_type = bswap64_type;
2475 }
2476 break;
2477 default:
2478 continue;
2479 }
2480
2481 if (bswap && !fndecl)
2482 continue;
2483
2484 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2485 &n, bswap))
2486 changed = true;
2487 }
2488 }
2489
2490 statistics_counter_event (fun, "16-bit nop implementations found",
2491 nop_stats.found_16bit);
2492 statistics_counter_event (fun, "32-bit nop implementations found",
2493 nop_stats.found_32bit);
2494 statistics_counter_event (fun, "64-bit nop implementations found",
2495 nop_stats.found_64bit);
2496 statistics_counter_event (fun, "16-bit bswap implementations found",
2497 bswap_stats.found_16bit);
2498 statistics_counter_event (fun, "32-bit bswap implementations found",
2499 bswap_stats.found_32bit);
2500 statistics_counter_event (fun, "64-bit bswap implementations found",
2501 bswap_stats.found_64bit);
2502
2503 return (changed ? TODO_update_ssa : 0);
2504 }
2505
2506 } // anon namespace
2507
2508 gimple_opt_pass *
2509 make_pass_optimize_bswap (gcc::context *ctxt)
2510 {
2511 return new pass_optimize_bswap (ctxt);
2512 }
2513
2514 /* Return true if stmt is a type conversion operation that can be stripped
2515 when used in a widening multiply operation. */
2516 static bool
2517 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2518 {
2519 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2520
2521 if (TREE_CODE (result_type) == INTEGER_TYPE)
2522 {
2523 tree op_type;
2524 tree inner_op_type;
2525
2526 if (!CONVERT_EXPR_CODE_P (rhs_code))
2527 return false;
2528
2529 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2530
2531 /* If the type of OP has the same precision as the result, then
2532 we can strip this conversion. The multiply operation will be
2533 selected to create the correct extension as a by-product. */
2534 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2535 return true;
2536
2537 /* We can also strip a conversion if it preserves the signedness of
2538 the operation and doesn't narrow the range. */
2539 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2540
2541 /* If the inner-most type is unsigned, then we can strip any
2542 intermediate widening operation. If it's signed, then the
2543 intermediate widening operation must also be signed. */
2544 if ((TYPE_UNSIGNED (inner_op_type)
2545 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2546 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2547 return true;
2548
2549 return false;
2550 }
2551
2552 return rhs_code == FIXED_CONVERT_EXPR;
2553 }
2554
2555 /* Return true if RHS is a suitable operand for a widening multiplication,
2556 assuming a target type of TYPE.
2557 There are two cases:
2558
2559 - RHS makes some value at least twice as wide. Store that value
2560 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2561
2562 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2563 but leave *TYPE_OUT untouched. */
2564
2565 static bool
2566 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2567 tree *new_rhs_out)
2568 {
2569 gimple stmt;
2570 tree type1, rhs1;
2571
2572 if (TREE_CODE (rhs) == SSA_NAME)
2573 {
2574 stmt = SSA_NAME_DEF_STMT (rhs);
2575 if (is_gimple_assign (stmt))
2576 {
2577 if (! widening_mult_conversion_strippable_p (type, stmt))
2578 rhs1 = rhs;
2579 else
2580 {
2581 rhs1 = gimple_assign_rhs1 (stmt);
2582
2583 if (TREE_CODE (rhs1) == INTEGER_CST)
2584 {
2585 *new_rhs_out = rhs1;
2586 *type_out = NULL;
2587 return true;
2588 }
2589 }
2590 }
2591 else
2592 rhs1 = rhs;
2593
2594 type1 = TREE_TYPE (rhs1);
2595
2596 if (TREE_CODE (type1) != TREE_CODE (type)
2597 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2598 return false;
2599
2600 *new_rhs_out = rhs1;
2601 *type_out = type1;
2602 return true;
2603 }
2604
2605 if (TREE_CODE (rhs) == INTEGER_CST)
2606 {
2607 *new_rhs_out = rhs;
2608 *type_out = NULL;
2609 return true;
2610 }
2611
2612 return false;
2613 }
2614
2615 /* Return true if STMT performs a widening multiplication, assuming the
2616 output type is TYPE. If so, store the unwidened types of the operands
2617 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2618 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2619 and *TYPE2_OUT would give the operands of the multiplication. */
2620
2621 static bool
2622 is_widening_mult_p (gimple stmt,
2623 tree *type1_out, tree *rhs1_out,
2624 tree *type2_out, tree *rhs2_out)
2625 {
2626 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2627
2628 if (TREE_CODE (type) != INTEGER_TYPE
2629 && TREE_CODE (type) != FIXED_POINT_TYPE)
2630 return false;
2631
2632 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2633 rhs1_out))
2634 return false;
2635
2636 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2637 rhs2_out))
2638 return false;
2639
2640 if (*type1_out == NULL)
2641 {
2642 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2643 return false;
2644 *type1_out = *type2_out;
2645 }
2646
2647 if (*type2_out == NULL)
2648 {
2649 if (!int_fits_type_p (*rhs2_out, *type1_out))
2650 return false;
2651 *type2_out = *type1_out;
2652 }
2653
2654 /* Ensure that the larger of the two operands comes first. */
2655 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2656 {
2657 tree tmp;
2658 tmp = *type1_out;
2659 *type1_out = *type2_out;
2660 *type2_out = tmp;
2661 tmp = *rhs1_out;
2662 *rhs1_out = *rhs2_out;
2663 *rhs2_out = tmp;
2664 }
2665
2666 return true;
2667 }
2668
2669 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2670 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2671 value is true iff we converted the statement. */
2672
2673 static bool
2674 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2675 {
2676 tree lhs, rhs1, rhs2, type, type1, type2;
2677 enum insn_code handler;
2678 machine_mode to_mode, from_mode, actual_mode;
2679 optab op;
2680 int actual_precision;
2681 location_t loc = gimple_location (stmt);
2682 bool from_unsigned1, from_unsigned2;
2683
2684 lhs = gimple_assign_lhs (stmt);
2685 type = TREE_TYPE (lhs);
2686 if (TREE_CODE (type) != INTEGER_TYPE)
2687 return false;
2688
2689 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2690 return false;
2691
2692 to_mode = TYPE_MODE (type);
2693 from_mode = TYPE_MODE (type1);
2694 from_unsigned1 = TYPE_UNSIGNED (type1);
2695 from_unsigned2 = TYPE_UNSIGNED (type2);
2696
2697 if (from_unsigned1 && from_unsigned2)
2698 op = umul_widen_optab;
2699 else if (!from_unsigned1 && !from_unsigned2)
2700 op = smul_widen_optab;
2701 else
2702 op = usmul_widen_optab;
2703
2704 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2705 0, &actual_mode);
2706
2707 if (handler == CODE_FOR_nothing)
2708 {
2709 if (op != smul_widen_optab)
2710 {
2711 /* We can use a signed multiply with unsigned types as long as
2712 there is a wider mode to use, or it is the smaller of the two
2713 types that is unsigned. Note that type1 >= type2, always. */
2714 if ((TYPE_UNSIGNED (type1)
2715 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2716 || (TYPE_UNSIGNED (type2)
2717 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2718 {
2719 from_mode = GET_MODE_WIDER_MODE (from_mode);
2720 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2721 return false;
2722 }
2723
2724 op = smul_widen_optab;
2725 handler = find_widening_optab_handler_and_mode (op, to_mode,
2726 from_mode, 0,
2727 &actual_mode);
2728
2729 if (handler == CODE_FOR_nothing)
2730 return false;
2731
2732 from_unsigned1 = from_unsigned2 = false;
2733 }
2734 else
2735 return false;
2736 }
2737
2738 /* Ensure that the inputs to the handler are in the correct precision
2739 for the opcode. This will be the full mode size. */
2740 actual_precision = GET_MODE_PRECISION (actual_mode);
2741 if (2 * actual_precision > TYPE_PRECISION (type))
2742 return false;
2743 if (actual_precision != TYPE_PRECISION (type1)
2744 || from_unsigned1 != TYPE_UNSIGNED (type1))
2745 rhs1 = build_and_insert_cast (gsi, loc,
2746 build_nonstandard_integer_type
2747 (actual_precision, from_unsigned1), rhs1);
2748 if (actual_precision != TYPE_PRECISION (type2)
2749 || from_unsigned2 != TYPE_UNSIGNED (type2))
2750 rhs2 = build_and_insert_cast (gsi, loc,
2751 build_nonstandard_integer_type
2752 (actual_precision, from_unsigned2), rhs2);
2753
2754 /* Handle constants. */
2755 if (TREE_CODE (rhs1) == INTEGER_CST)
2756 rhs1 = fold_convert (type1, rhs1);
2757 if (TREE_CODE (rhs2) == INTEGER_CST)
2758 rhs2 = fold_convert (type2, rhs2);
2759
2760 gimple_assign_set_rhs1 (stmt, rhs1);
2761 gimple_assign_set_rhs2 (stmt, rhs2);
2762 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2763 update_stmt (stmt);
2764 widen_mul_stats.widen_mults_inserted++;
2765 return true;
2766 }
2767
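/* Illustrative sketch (not part of the pass): a source-level pattern that
   convert_mult_to_widen above is intended to turn into a WIDEN_MULT_EXPR,
   provided the target implements a widening multiply optab (e.g.
   smul_widen_optab) for the modes involved.  Only a sketch; guarded with
   "#if 0" so it has no effect on the build.  */
#if 0
#include <stdint.h>

int64_t
widening_mul (int32_t a, int32_t b)
{
  /* Both operands are sign-extensions of narrower values, so the 64-bit
     MULT_EXPR is a candidate for a single widening multiply.  */
  return (int64_t) a * b;
}
#endif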
2768 /* Process a single gimple statement STMT, which is found at the
2769 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2770 rhs (given by CODE), and try to convert it into a
2771 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2772 is true iff we converted the statement. */
2773
2774 static bool
2775 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2776 enum tree_code code)
2777 {
2778 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2779 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2780 tree type, type1, type2, optype;
2781 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2782 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2783 optab this_optab;
2784 enum tree_code wmult_code;
2785 enum insn_code handler;
2786 machine_mode to_mode, from_mode, actual_mode;
2787 location_t loc = gimple_location (stmt);
2788 int actual_precision;
2789 bool from_unsigned1, from_unsigned2;
2790
2791 lhs = gimple_assign_lhs (stmt);
2792 type = TREE_TYPE (lhs);
2793 if (TREE_CODE (type) != INTEGER_TYPE
2794 && TREE_CODE (type) != FIXED_POINT_TYPE)
2795 return false;
2796
2797 if (code == MINUS_EXPR)
2798 wmult_code = WIDEN_MULT_MINUS_EXPR;
2799 else
2800 wmult_code = WIDEN_MULT_PLUS_EXPR;
2801
2802 rhs1 = gimple_assign_rhs1 (stmt);
2803 rhs2 = gimple_assign_rhs2 (stmt);
2804
2805 if (TREE_CODE (rhs1) == SSA_NAME)
2806 {
2807 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2808 if (is_gimple_assign (rhs1_stmt))
2809 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2810 }
2811
2812 if (TREE_CODE (rhs2) == SSA_NAME)
2813 {
2814 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2815 if (is_gimple_assign (rhs2_stmt))
2816 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2817 }
2818
2819 /* Allow for one conversion statement between the multiply
2820 and addition/subtraction statement. If there are more than
2821 one conversions then we assume they would invalidate this
2822 transformation. If that's not the case then they should have
2823 been folded before now. */
2824 if (CONVERT_EXPR_CODE_P (rhs1_code))
2825 {
2826 conv1_stmt = rhs1_stmt;
2827 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2828 if (TREE_CODE (rhs1) == SSA_NAME)
2829 {
2830 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2831 if (is_gimple_assign (rhs1_stmt))
2832 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2833 }
2834 else
2835 return false;
2836 }
2837 if (CONVERT_EXPR_CODE_P (rhs2_code))
2838 {
2839 conv2_stmt = rhs2_stmt;
2840 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2841 if (TREE_CODE (rhs2) == SSA_NAME)
2842 {
2843 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2844 if (is_gimple_assign (rhs2_stmt))
2845 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2846 }
2847 else
2848 return false;
2849 }
2850
2851 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2852 is_widening_mult_p, but we still need the rhs values it returns.
2853
2854 It might also appear that it would be sufficient to use the existing
2855 operands of the widening multiply, but that would limit the choice of
2856 multiply-and-accumulate instructions.
2857
2858 If the widened-multiplication result has more than one use, it is
2859 probably wiser not to do the conversion. */
2860 if (code == PLUS_EXPR
2861 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2862 {
2863 if (!has_single_use (rhs1)
2864 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2865 &type2, &mult_rhs2))
2866 return false;
2867 add_rhs = rhs2;
2868 conv_stmt = conv1_stmt;
2869 }
2870 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2871 {
2872 if (!has_single_use (rhs2)
2873 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2874 &type2, &mult_rhs2))
2875 return false;
2876 add_rhs = rhs1;
2877 conv_stmt = conv2_stmt;
2878 }
2879 else
2880 return false;
2881
2882 to_mode = TYPE_MODE (type);
2883 from_mode = TYPE_MODE (type1);
2884 from_unsigned1 = TYPE_UNSIGNED (type1);
2885 from_unsigned2 = TYPE_UNSIGNED (type2);
2886 optype = type1;
2887
2888 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2889 if (from_unsigned1 != from_unsigned2)
2890 {
2891 if (!INTEGRAL_TYPE_P (type))
2892 return false;
2893 /* We can use a signed multiply with unsigned types as long as
2894 there is a wider mode to use, or it is the smaller of the two
2895 types that is unsigned. Note that type1 >= type2, always. */
2896 if ((from_unsigned1
2897 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2898 || (from_unsigned2
2899 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2900 {
2901 from_mode = GET_MODE_WIDER_MODE (from_mode);
2902 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2903 return false;
2904 }
2905
2906 from_unsigned1 = from_unsigned2 = false;
2907 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2908 false);
2909 }
2910
2911 /* If there was a conversion between the multiply and addition
2912 then we need to make sure it fits a multiply-and-accumulate.
2913 There should be a single mode change which does not change the
2914 value. */
2915 if (conv_stmt)
2916 {
2917 /* We use the original, unmodified data types for this. */
2918 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2919 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2920 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2921 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2922
2923 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2924 {
2925 /* Conversion is a truncate. */
2926 if (TYPE_PRECISION (to_type) < data_size)
2927 return false;
2928 }
2929 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2930 {
2931 /* Conversion is an extend. Check it's the right sort. */
2932 if (TYPE_UNSIGNED (from_type) != is_unsigned
2933 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2934 return false;
2935 }
2936 /* else convert is a no-op for our purposes. */
2937 }
2938
2939 /* Verify that the machine can perform a widening multiply
2940 accumulate in this mode/signedness combination, otherwise
2941 this transformation is likely to pessimize code. */
2942 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2943 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2944 from_mode, 0, &actual_mode);
2945
2946 if (handler == CODE_FOR_nothing)
2947 return false;
2948
2949 /* Ensure that the inputs to the handler are in the correct precision
2950 for the opcode. This will be the full mode size. */
2951 actual_precision = GET_MODE_PRECISION (actual_mode);
2952 if (actual_precision != TYPE_PRECISION (type1)
2953 || from_unsigned1 != TYPE_UNSIGNED (type1))
2954 mult_rhs1 = build_and_insert_cast (gsi, loc,
2955 build_nonstandard_integer_type
2956 (actual_precision, from_unsigned1),
2957 mult_rhs1);
2958 if (actual_precision != TYPE_PRECISION (type2)
2959 || from_unsigned2 != TYPE_UNSIGNED (type2))
2960 mult_rhs2 = build_and_insert_cast (gsi, loc,
2961 build_nonstandard_integer_type
2962 (actual_precision, from_unsigned2),
2963 mult_rhs2);
2964
2965 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2966 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2967
2968 /* Handle constants. */
2969 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2970 mult_rhs1 = fold_convert (type1, mult_rhs1);
2971 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2972 mult_rhs2 = fold_convert (type2, mult_rhs2);
2973
2974 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2975 add_rhs);
2976 update_stmt (gsi_stmt (*gsi));
2977 widen_mul_stats.maccs_inserted++;
2978 return true;
2979 }
2980
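/* Illustrative sketch (not part of the pass): a multiply-accumulate shape
   that convert_plusminus_to_widen above may rewrite to WIDEN_MULT_PLUS_EXPR
   when the target provides a suitable widening multiply-accumulate pattern.
   Only a sketch; guarded with "#if 0" so it has no effect on the build.  */
#if 0
#include <stdint.h>

int64_t
widening_macc (int64_t acc, int32_t a, int32_t b)
{
  /* A PLUS_EXPR whose addend is a widening multiply of the two narrower
     operands.  */
  return acc + (int64_t) a * b;
}
#endif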
2981 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
2982 with uses in additions and subtractions to form fused multiply-add
2983 operations. Returns true if successful and MUL_STMT should be removed. */
2984
2985 static bool
2986 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2987 {
2988 tree mul_result = gimple_get_lhs (mul_stmt);
2989 tree type = TREE_TYPE (mul_result);
2990 gimple use_stmt, neguse_stmt;
2991 gassign *fma_stmt;
2992 use_operand_p use_p;
2993 imm_use_iterator imm_iter;
2994
2995 if (FLOAT_TYPE_P (type)
2996 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2997 return false;
2998
2999 /* We don't want to do bitfield reduction ops. */
3000 if (INTEGRAL_TYPE_P (type)
3001 && (TYPE_PRECISION (type)
3002 != GET_MODE_PRECISION (TYPE_MODE (type))))
3003 return false;
3004
3005 /* If the target doesn't support it, don't generate it. We assume that
3006 if fma isn't available then fms, fnma or fnms are not either. */
3007 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3008 return false;
3009
3010 /* If the multiplication has zero uses, it is kept around probably because
3011 of -fnon-call-exceptions. Don't optimize it away in that case,
3012 it is DCE's job. */
3013 if (has_zero_uses (mul_result))
3014 return false;
3015
3016 /* Make sure that the multiplication statement becomes dead after
3017 the transformation, i.e. that all uses are transformed to FMAs.
3018 This means we assume that an FMA operation has the same cost
3019 as an addition. */
3020 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3021 {
3022 enum tree_code use_code;
3023 tree result = mul_result;
3024 bool negate_p = false;
3025
3026 use_stmt = USE_STMT (use_p);
3027
3028 if (is_gimple_debug (use_stmt))
3029 continue;
3030
3031 /* For now restrict these operations to single basic blocks. In theory
3032 we would want to support sinking the multiplication in
3033 m = a*b;
3034 if ()
3035 ma = m + c;
3036 else
3037 d = m;
3038 to form a fma in the then block and sink the multiplication to the
3039 else block. */
3040 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3041 return false;
3042
3043 if (!is_gimple_assign (use_stmt))
3044 return false;
3045
3046 use_code = gimple_assign_rhs_code (use_stmt);
3047
3048 /* A negate on the multiplication leads to FNMA. */
3049 if (use_code == NEGATE_EXPR)
3050 {
3051 ssa_op_iter iter;
3052 use_operand_p usep;
3053
3054 result = gimple_assign_lhs (use_stmt);
3055
3056 /* Make sure the negate statement becomes dead with this
3057 single transformation. */
3058 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3059 &use_p, &neguse_stmt))
3060 return false;
3061
3062 /* Make sure the multiplication isn't also used on that stmt. */
3063 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3064 if (USE_FROM_PTR (usep) == mul_result)
3065 return false;
3066
3067 /* Re-validate. */
3068 use_stmt = neguse_stmt;
3069 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3070 return false;
3071 if (!is_gimple_assign (use_stmt))
3072 return false;
3073
3074 use_code = gimple_assign_rhs_code (use_stmt);
3075 negate_p = true;
3076 }
3077
3078 switch (use_code)
3079 {
3080 case MINUS_EXPR:
3081 if (gimple_assign_rhs2 (use_stmt) == result)
3082 negate_p = !negate_p;
3083 break;
3084 case PLUS_EXPR:
3085 break;
3086 default:
3087 /* FMA can only be formed from PLUS and MINUS. */
3088 return false;
3089 }
3090
3091 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3092 by a MULT_EXPR that we'll visit later, we might be able to
3093 get a more profitable match with fnma.
3094 OTOH, if we don't, a negate / fma pair has likely lower latency
3095 than a mult / subtract pair. */
3096 if (use_code == MINUS_EXPR && !negate_p
3097 && gimple_assign_rhs1 (use_stmt) == result
3098 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3099 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3100 {
3101 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3102
3103 if (TREE_CODE (rhs2) == SSA_NAME)
3104 {
3105 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3106 if (has_single_use (rhs2)
3107 && is_gimple_assign (stmt2)
3108 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3109 return false;
3110 }
3111 }
3112
3113 /* We can't handle a * b + a * b. */
3114 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3115 return false;
3116
3117 /* While it is possible to validate whether or not the exact form
3118 that we've recognized is available in the backend, the assumption
3119 is that the transformation is never a loss. For instance, suppose
3120 the target only has the plain FMA pattern available. Consider
3121 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3122 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3123 still have 3 operations, but in the FMA form the two NEGs are
3124 independent and could be run in parallel. */
3125 }
3126
3127 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3128 {
3129 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3130 enum tree_code use_code;
3131 tree addop, mulop1 = op1, result = mul_result;
3132 bool negate_p = false;
3133
3134 if (is_gimple_debug (use_stmt))
3135 continue;
3136
3137 use_code = gimple_assign_rhs_code (use_stmt);
3138 if (use_code == NEGATE_EXPR)
3139 {
3140 result = gimple_assign_lhs (use_stmt);
3141 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3142 gsi_remove (&gsi, true);
3143 release_defs (use_stmt);
3144
3145 use_stmt = neguse_stmt;
3146 gsi = gsi_for_stmt (use_stmt);
3147 use_code = gimple_assign_rhs_code (use_stmt);
3148 negate_p = true;
3149 }
3150
3151 if (gimple_assign_rhs1 (use_stmt) == result)
3152 {
3153 addop = gimple_assign_rhs2 (use_stmt);
3154 /* a * b - c -> a * b + (-c) */
3155 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3156 addop = force_gimple_operand_gsi (&gsi,
3157 build1 (NEGATE_EXPR,
3158 type, addop),
3159 true, NULL_TREE, true,
3160 GSI_SAME_STMT);
3161 }
3162 else
3163 {
3164 addop = gimple_assign_rhs1 (use_stmt);
3165 /* a - b * c -> (-b) * c + a */
3166 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3167 negate_p = !negate_p;
3168 }
3169
3170 if (negate_p)
3171 mulop1 = force_gimple_operand_gsi (&gsi,
3172 build1 (NEGATE_EXPR,
3173 type, mulop1),
3174 true, NULL_TREE, true,
3175 GSI_SAME_STMT);
3176
3177 fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
3178 gimple_assign_lhs (use_stmt),
3179 mulop1, op2, addop);
3180 gsi_replace (&gsi, fma_stmt, true);
3181 widen_mul_stats.fmas_inserted++;
3182 }
3183
3184 return true;
3185 }
3186
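/* Illustrative sketch (not part of the pass): the basic shape that
   convert_mult_to_fma above looks for.  With floating-point contraction
   enabled (e.g. -ffp-contract=fast) and an fma pattern available for the
   mode, the multiply and the addition can be fused into a single FMA_EXPR.
   Only a sketch; guarded with "#if 0" so it has no effect on the build.  */
#if 0
double
fma_candidate (double a, double b, double c)
{
  /* MULT_EXPR whose single use is a PLUS_EXPR: an FMA candidate.  */
  return a * b + c;
}
#endif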
3187 /* Find integer multiplications where the operands are extended from
3188 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3189 where appropriate. */
3190
3191 namespace {
3192
3193 const pass_data pass_data_optimize_widening_mul =
3194 {
3195 GIMPLE_PASS, /* type */
3196 "widening_mul", /* name */
3197 OPTGROUP_NONE, /* optinfo_flags */
3198 TV_NONE, /* tv_id */
3199 PROP_ssa, /* properties_required */
3200 0, /* properties_provided */
3201 0, /* properties_destroyed */
3202 0, /* todo_flags_start */
3203 TODO_update_ssa, /* todo_flags_finish */
3204 };
3205
3206 class pass_optimize_widening_mul : public gimple_opt_pass
3207 {
3208 public:
3209 pass_optimize_widening_mul (gcc::context *ctxt)
3210 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3211 {}
3212
3213 /* opt_pass methods: */
3214 virtual bool gate (function *)
3215 {
3216 return flag_expensive_optimizations && optimize;
3217 }
3218
3219 virtual unsigned int execute (function *);
3220
3221 }; // class pass_optimize_widening_mul
3222
3223 unsigned int
3224 pass_optimize_widening_mul::execute (function *fun)
3225 {
3226 basic_block bb;
3227 bool cfg_changed = false;
3228
3229 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3230
3231 FOR_EACH_BB_FN (bb, fun)
3232 {
3233 gimple_stmt_iterator gsi;
3234
3235 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3236 {
3237 gimple stmt = gsi_stmt (gsi);
3238 enum tree_code code;
3239
3240 if (is_gimple_assign (stmt))
3241 {
3242 code = gimple_assign_rhs_code (stmt);
3243 switch (code)
3244 {
3245 case MULT_EXPR:
3246 if (!convert_mult_to_widen (stmt, &gsi)
3247 && convert_mult_to_fma (stmt,
3248 gimple_assign_rhs1 (stmt),
3249 gimple_assign_rhs2 (stmt)))
3250 {
3251 gsi_remove (&gsi, true);
3252 release_defs (stmt);
3253 continue;
3254 }
3255 break;
3256
3257 case PLUS_EXPR:
3258 case MINUS_EXPR:
3259 convert_plusminus_to_widen (&gsi, stmt, code);
3260 break;
3261
3262 default:;
3263 }
3264 }
3265 else if (is_gimple_call (stmt)
3266 && gimple_call_lhs (stmt))
3267 {
3268 tree fndecl = gimple_call_fndecl (stmt);
3269 if (fndecl
3270 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3271 {
3272 switch (DECL_FUNCTION_CODE (fndecl))
3273 {
3274 case BUILT_IN_POWF:
3275 case BUILT_IN_POW:
3276 case BUILT_IN_POWL:
3277 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3278 && REAL_VALUES_EQUAL
3279 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3280 dconst2)
3281 && convert_mult_to_fma (stmt,
3282 gimple_call_arg (stmt, 0),
3283 gimple_call_arg (stmt, 0)))
3284 {
3285 unlink_stmt_vdef (stmt);
3286 if (gsi_remove (&gsi, true)
3287 && gimple_purge_dead_eh_edges (bb))
3288 cfg_changed = true;
3289 release_defs (stmt);
3290 continue;
3291 }
3292 break;
3293
3294 default:;
3295 }
3296 }
3297 }
3298 gsi_next (&gsi);
3299 }
3300 }
3301
3302 statistics_counter_event (fun, "widening multiplications inserted",
3303 widen_mul_stats.widen_mults_inserted);
3304 statistics_counter_event (fun, "widening maccs inserted",
3305 widen_mul_stats.maccs_inserted);
3306 statistics_counter_event (fun, "fused multiply-adds inserted",
3307 widen_mul_stats.fmas_inserted);
3308
3309 return cfg_changed ? TODO_cleanup_cfg : 0;
3310 }
3311
3312 } // anon namespace
3313
3314 gimple_opt_pass *
3315 make_pass_optimize_widening_mul (gcc::context *ctxt)
3316 {
3317 return new pass_optimize_widening_mul (ctxt);
3318 }