gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2014 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43 First of all, experiments found that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and their common dominators,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "predict.h"
94 #include "vec.h"
95 #include "hashtab.h"
96 #include "hash-set.h"
97 #include "machmode.h"
98 #include "hard-reg-set.h"
99 #include "input.h"
100 #include "function.h"
101 #include "dominance.h"
102 #include "cfg.h"
103 #include "basic-block.h"
104 #include "tree-ssa-alias.h"
105 #include "internal-fn.h"
106 #include "gimple-fold.h"
107 #include "gimple-expr.h"
108 #include "is-a.h"
109 #include "gimple.h"
110 #include "gimple-iterator.h"
111 #include "gimplify.h"
112 #include "gimplify-me.h"
113 #include "stor-layout.h"
114 #include "gimple-ssa.h"
115 #include "tree-cfg.h"
116 #include "tree-phinodes.h"
117 #include "ssa-iterators.h"
118 #include "stringpool.h"
119 #include "tree-ssanames.h"
120 #include "expr.h"
121 #include "tree-dfa.h"
122 #include "tree-ssa.h"
123 #include "tree-pass.h"
124 #include "alloc-pool.h"
125 #include "target.h"
126 #include "gimple-pretty-print.h"
127 #include "builtins.h"
128
129 /* FIXME: RTL headers have to be included here for optabs. */
130 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
131 #include "expr.h" /* Because optabs.h wants sepops. */
132 #include "insn-codes.h"
133 #include "optabs.h"
134
135 /* This structure represents one basic block that either computes a
136 division, or is a common dominator for basic blocks that compute a
137 division. */
138 struct occurrence {
139 /* The basic block represented by this structure. */
140 basic_block bb;
141
142 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
143 inserted in BB. */
144 tree recip_def;
145
146 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
147 was inserted in BB. */
148 gimple recip_def_stmt;
149
150 /* Pointer to a list of "struct occurrence"s for blocks dominated
151 by BB. */
152 struct occurrence *children;
153
154 /* Pointer to the next "struct occurrence" in the list of blocks
155 sharing a common dominator. */
156 struct occurrence *next;
157
158 /* The number of divisions that are in BB before compute_merit. The
159 number of divisions that are in BB or post-dominate it after
160 compute_merit. */
161 int num_divisions;
162
163 /* True if the basic block has a division, false if it is a common
164 dominator for basic blocks that do. If it is false and trapping
165 math is active, BB is not a candidate for inserting a reciprocal. */
166 bool bb_has_division;
167 };
168
169 static struct
170 {
171 /* Number of 1.0/X ops inserted. */
172 int rdivs_inserted;
173
174 /* Number of 1.0/FUNC ops inserted. */
175 int rfuncs_inserted;
176 } reciprocal_stats;
177
178 static struct
179 {
180 /* Number of cexpi calls inserted. */
181 int inserted;
182 } sincos_stats;
183
184 static struct
185 {
186 /* Number of hand-written 16-bit nop / bswaps found. */
187 int found_16bit;
188
189 /* Number of hand-written 32-bit nop / bswaps found. */
190 int found_32bit;
191
192 /* Number of hand-written 64-bit nop / bswaps found. */
193 int found_64bit;
194 } nop_stats, bswap_stats;
195
196 static struct
197 {
198 /* Number of widening multiplication ops inserted. */
199 int widen_mults_inserted;
200
201 /* Number of integer multiply-and-accumulate ops inserted. */
202 int maccs_inserted;
203
204 /* Number of fp fused multiply-add ops inserted. */
205 int fmas_inserted;
206 } widen_mul_stats;
207
208 /* The instance of "struct occurrence" representing the highest
209 interesting block in the dominator tree. */
210 static struct occurrence *occ_head;
211
212 /* Allocation pool for getting instances of "struct occurrence". */
213 static alloc_pool occ_pool;
214
215
216
217 /* Allocate and return a new struct occurrence for basic block BB, whose
218 children list is headed by CHILDREN. */
219 static struct occurrence *
220 occ_new (basic_block bb, struct occurrence *children)
221 {
222 struct occurrence *occ;
223
224 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
225 memset (occ, 0, sizeof (struct occurrence));
226
227 occ->bb = bb;
228 occ->children = children;
229 return occ;
230 }
231
232
233 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
234 list of "struct occurrence"s, one per basic block, having IDOM as
235 their common dominator.
236
237 We try to insert NEW_OCC as deep as possible in the tree, and we also
238 insert any other block that is a common dominator for BB and one
239 block already in the tree. */
240
241 static void
242 insert_bb (struct occurrence *new_occ, basic_block idom,
243 struct occurrence **p_head)
244 {
245 struct occurrence *occ, **p_occ;
246
247 for (p_occ = p_head; (occ = *p_occ) != NULL; )
248 {
249 basic_block bb = new_occ->bb, occ_bb = occ->bb;
250 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
251 if (dom == bb)
252 {
253 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
254 from its list. */
255 *p_occ = occ->next;
256 occ->next = new_occ->children;
257 new_occ->children = occ;
258
259 /* Try the next block (it may as well be dominated by BB). */
260 }
261
262 else if (dom == occ_bb)
263 {
264 /* OCC_BB dominates BB. Tail recurse to look deeper. */
265 insert_bb (new_occ, dom, &occ->children);
266 return;
267 }
268
269 else if (dom != idom)
270 {
271 gcc_assert (!dom->aux);
272
273 /* There is a dominator between IDOM and BB, add it and make
274 two children out of NEW_OCC and OCC. First, remove OCC from
275 its list. */
276 *p_occ = occ->next;
277 new_occ->next = occ;
278 occ->next = NULL;
279
280 /* None of the previous blocks has DOM as a dominator: if we tail
281 recursed, we would reexamine them uselessly. Just switch BB with
282 DOM, and go on looking for blocks dominated by DOM. */
283 new_occ = occ_new (dom, new_occ);
284 }
285
286 else
287 {
288 /* Nothing special, go on with the next element. */
289 p_occ = &occ->next;
290 }
291 }
292
293 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
294 new_occ->next = *p_head;
295 *p_head = new_occ;
296 }
297
298 /* Register that we found a division in BB. */
299
300 static inline void
301 register_division_in (basic_block bb)
302 {
303 struct occurrence *occ;
304
305 occ = (struct occurrence *) bb->aux;
306 if (!occ)
307 {
308 occ = occ_new (bb, NULL);
309 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
310 }
311
312 occ->bb_has_division = true;
313 occ->num_divisions++;
314 }
315
316
317 /* Compute the number of divisions that postdominate each block in OCC and
318 its children. */
319
320 static void
321 compute_merit (struct occurrence *occ)
322 {
323 struct occurrence *occ_child;
324 basic_block dom = occ->bb;
325
326 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
327 {
328 basic_block bb;
329 if (occ_child->children)
330 compute_merit (occ_child);
331
332 if (flag_exceptions)
333 bb = single_noncomplex_succ (dom);
334 else
335 bb = dom;
336
337 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
338 occ->num_divisions += occ_child->num_divisions;
339 }
340 }
341
342
343 /* Return whether USE_STMT is a floating-point division by DEF. */
344 static inline bool
345 is_division_by (gimple use_stmt, tree def)
346 {
347 return is_gimple_assign (use_stmt)
348 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
349 && gimple_assign_rhs2 (use_stmt) == def
350 /* Do not recognize x / x as valid division, as we are getting
351 confused later by replacing all immediate uses of x in such
352 a stmt. */
353 && gimple_assign_rhs1 (use_stmt) != def;
354 }
355
356 /* Walk the subset of the dominator tree rooted at OCC, setting the
357 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
358 the given basic block. The field may be left NULL, of course,
359 if it is not possible or profitable to do the optimization.
360
361 DEF_GSI is an iterator pointing at the statement defining DEF.
362 If RECIP_DEF is set, a dominator already has a computation that can
363 be used. */
364
365 static void
366 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
367 tree def, tree recip_def, int threshold)
368 {
369 tree type;
370 gimple new_stmt;
371 gimple_stmt_iterator gsi;
372 struct occurrence *occ_child;
373
374 if (!recip_def
375 && (occ->bb_has_division || !flag_trapping_math)
376 && occ->num_divisions >= threshold)
377 {
378 /* Make a variable with the replacement and substitute it. */
379 type = TREE_TYPE (def);
380 recip_def = create_tmp_reg (type, "reciptmp");
381 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
382 build_one_cst (type), def);
383
384 if (occ->bb_has_division)
385 {
386 /* Case 1: insert before an existing division. */
387 gsi = gsi_after_labels (occ->bb);
388 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
389 gsi_next (&gsi);
390
391 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
392 }
393 else if (def_gsi && occ->bb == def_gsi->bb)
394 {
395 /* Case 2: insert right after the definition. Note that this will
396 never happen if the definition statement can throw, because in
397 that case the sole successor of the statement's basic block will
398 dominate all the uses as well. */
399 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
400 }
401 else
402 {
403 /* Case 3: insert in a basic block not containing defs/uses. */
404 gsi = gsi_after_labels (occ->bb);
405 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
406 }
407
408 reciprocal_stats.rdivs_inserted++;
409
410 occ->recip_def_stmt = new_stmt;
411 }
412
413 occ->recip_def = recip_def;
414 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
415 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
416 }
417
418
419 /* Replace the division at USE_P with a multiplication by the reciprocal, if
420 possible. */
421
422 static inline void
423 replace_reciprocal (use_operand_p use_p)
424 {
425 gimple use_stmt = USE_STMT (use_p);
426 basic_block bb = gimple_bb (use_stmt);
427 struct occurrence *occ = (struct occurrence *) bb->aux;
428
429 if (optimize_bb_for_speed_p (bb)
430 && occ->recip_def && use_stmt != occ->recip_def_stmt)
431 {
432 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
433 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
434 SET_USE (use_p, occ->recip_def);
435 fold_stmt_inplace (&gsi);
436 update_stmt (use_stmt);
437 }
438 }
439
440
441 /* Free OCC and return one more "struct occurrence" to be freed. */
442
443 static struct occurrence *
444 free_bb (struct occurrence *occ)
445 {
446 struct occurrence *child, *next;
447
448 /* First get the two pointers hanging off OCC. */
449 next = occ->next;
450 child = occ->children;
451 occ->bb->aux = NULL;
452 pool_free (occ_pool, occ);
453
454 /* Now ensure that we don't recurse unless it is necessary. */
455 if (!child)
456 return next;
457 else
458 {
459 while (next)
460 next = free_bb (next);
461
462 return child;
463 }
464 }
465
466
467 /* Look for floating-point divisions among DEF's uses, and try to
468 replace them by multiplications with the reciprocal. Add
469 as many statements computing the reciprocal as needed.
470
471 DEF must be a GIMPLE register of a floating-point type. */
472
473 static void
474 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
475 {
476 use_operand_p use_p;
477 imm_use_iterator use_iter;
478 struct occurrence *occ;
479 int count = 0, threshold;
480
481 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
482
483 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
484 {
485 gimple use_stmt = USE_STMT (use_p);
486 if (is_division_by (use_stmt, def))
487 {
488 register_division_in (gimple_bb (use_stmt));
489 count++;
490 }
491 }
492
493 /* Do the expensive part only if we can hope to optimize something. */
494 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
495 if (count >= threshold)
496 {
497 gimple use_stmt;
498 for (occ = occ_head; occ; occ = occ->next)
499 {
500 compute_merit (occ);
501 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
502 }
503
504 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
505 {
506 if (is_division_by (use_stmt, def))
507 {
508 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
509 replace_reciprocal (use_p);
510 }
511 }
512 }
513
514 for (occ = occ_head; occ; )
515 occ = free_bb (occ);
516
517 occ_head = NULL;
518 }
519
520 /* Go through all the floating-point SSA_NAMEs, and call
521 execute_cse_reciprocals_1 on each of them. */
522 namespace {
523
524 const pass_data pass_data_cse_reciprocals =
525 {
526 GIMPLE_PASS, /* type */
527 "recip", /* name */
528 OPTGROUP_NONE, /* optinfo_flags */
529 TV_NONE, /* tv_id */
530 PROP_ssa, /* properties_required */
531 0, /* properties_provided */
532 0, /* properties_destroyed */
533 0, /* todo_flags_start */
534 TODO_update_ssa, /* todo_flags_finish */
535 };
536
537 class pass_cse_reciprocals : public gimple_opt_pass
538 {
539 public:
540 pass_cse_reciprocals (gcc::context *ctxt)
541 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
542 {}
543
544 /* opt_pass methods: */
545 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
546 virtual unsigned int execute (function *);
547
548 }; // class pass_cse_reciprocals
549
550 unsigned int
551 pass_cse_reciprocals::execute (function *fun)
552 {
553 basic_block bb;
554 tree arg;
555
556 occ_pool = create_alloc_pool ("dominators for recip",
557 sizeof (struct occurrence),
558 n_basic_blocks_for_fn (fun) / 3 + 1);
559
560 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
561 calculate_dominance_info (CDI_DOMINATORS);
562 calculate_dominance_info (CDI_POST_DOMINATORS);
563
564 #ifdef ENABLE_CHECKING
565 FOR_EACH_BB_FN (bb, fun)
566 gcc_assert (!bb->aux);
567 #endif
568
569 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
570 if (FLOAT_TYPE_P (TREE_TYPE (arg))
571 && is_gimple_reg (arg))
572 {
573 tree name = ssa_default_def (fun, arg);
574 if (name)
575 execute_cse_reciprocals_1 (NULL, name);
576 }
577
578 FOR_EACH_BB_FN (bb, fun)
579 {
580 gimple_stmt_iterator gsi;
581 gimple phi;
582 tree def;
583
584 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
585 {
586 phi = gsi_stmt (gsi);
587 def = PHI_RESULT (phi);
588 if (! virtual_operand_p (def)
589 && FLOAT_TYPE_P (TREE_TYPE (def)))
590 execute_cse_reciprocals_1 (NULL, def);
591 }
592
593 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
594 {
595 gimple stmt = gsi_stmt (gsi);
596
597 if (gimple_has_lhs (stmt)
598 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
599 && FLOAT_TYPE_P (TREE_TYPE (def))
600 && TREE_CODE (def) == SSA_NAME)
601 execute_cse_reciprocals_1 (&gsi, def);
602 }
603
604 if (optimize_bb_for_size_p (bb))
605 continue;
606
607 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
608 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
609 {
610 gimple stmt = gsi_stmt (gsi);
611 tree fndecl;
612
613 if (is_gimple_assign (stmt)
614 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
615 {
616 tree arg1 = gimple_assign_rhs2 (stmt);
617 gimple stmt1;
618
619 if (TREE_CODE (arg1) != SSA_NAME)
620 continue;
621
622 stmt1 = SSA_NAME_DEF_STMT (arg1);
623
624 if (is_gimple_call (stmt1)
625 && gimple_call_lhs (stmt1)
626 && (fndecl = gimple_call_fndecl (stmt1))
627 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
628 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
629 {
630 enum built_in_function code;
631 bool md_code, fail;
632 imm_use_iterator ui;
633 use_operand_p use_p;
634
635 code = DECL_FUNCTION_CODE (fndecl);
636 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
637
638 fndecl = targetm.builtin_reciprocal (code, md_code, false);
639 if (!fndecl)
640 continue;
641
642 /* Check that all uses of the SSA name are divisions,
643 otherwise replacing the defining statement will do
644 the wrong thing. */
645 fail = false;
646 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
647 {
648 gimple stmt2 = USE_STMT (use_p);
649 if (is_gimple_debug (stmt2))
650 continue;
651 if (!is_gimple_assign (stmt2)
652 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
653 || gimple_assign_rhs1 (stmt2) == arg1
654 || gimple_assign_rhs2 (stmt2) != arg1)
655 {
656 fail = true;
657 break;
658 }
659 }
660 if (fail)
661 continue;
662
663 gimple_replace_ssa_lhs (stmt1, arg1);
664 gimple_call_set_fndecl (stmt1, fndecl);
665 update_stmt (stmt1);
666 reciprocal_stats.rfuncs_inserted++;
667
668 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
669 {
670 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
671 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
672 fold_stmt_inplace (&gsi);
673 update_stmt (stmt);
674 }
675 }
676 }
677 }
678 }
679
680 statistics_counter_event (fun, "reciprocal divs inserted",
681 reciprocal_stats.rdivs_inserted);
682 statistics_counter_event (fun, "reciprocal functions inserted",
683 reciprocal_stats.rfuncs_inserted);
684
685 free_dominance_info (CDI_DOMINATORS);
686 free_dominance_info (CDI_POST_DOMINATORS);
687 free_alloc_pool (occ_pool);
688 return 0;
689 }
690
691 } // anon namespace
692
693 gimple_opt_pass *
694 make_pass_cse_reciprocals (gcc::context *ctxt)
695 {
696 return new pass_cse_reciprocals (ctxt);
697 }
698
699 /* Records an occurrence at statement USE_STMT in the vector of trees
700 STMTS if it is dominated by *TOP_BB, dominates *TOP_BB, or if *TOP_BB
701 is not yet initialized. Returns true if the occurrence was pushed on
702 the vector. Adjusts *TOP_BB to be the basic block dominating all
703 statements in the vector. */
704
705 static bool
706 maybe_record_sincos (vec<gimple> *stmts,
707 basic_block *top_bb, gimple use_stmt)
708 {
709 basic_block use_bb = gimple_bb (use_stmt);
710 if (*top_bb
711 && (*top_bb == use_bb
712 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
713 stmts->safe_push (use_stmt);
714 else if (!*top_bb
715 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
716 {
717 stmts->safe_push (use_stmt);
718 *top_bb = use_bb;
719 }
720 else
721 return false;
722
723 return true;
724 }
725
726 /* Look for sin, cos and cexpi calls with the same argument NAME and,
727 in that case, create a single call to cexpi, CSEing the result.
728 We first walk over all immediate uses of the argument, collecting in
729 a vector the statements that we can CSE; in a second pass we replace
730 each statement's rhs with a REALPART or IMAGPART expression on the
731 result of the cexpi call, which we insert before the use statement
732 that dominates all other candidates. */
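/* As a small illustration (with hypothetical names): for

     c = cos (x);
     s = sin (x);

   in the same basic block, we insert  t = cexpi (x)  before the dominating
   call and rewrite the two calls as  c = REALPART_EXPR <t>  and
   s = IMAGPART_EXPR <t>.  */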
733
734 static bool
735 execute_cse_sincos_1 (tree name)
736 {
737 gimple_stmt_iterator gsi;
738 imm_use_iterator use_iter;
739 tree fndecl, res, type;
740 gimple def_stmt, use_stmt, stmt;
741 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
742 vec<gimple> stmts = vNULL;
743 basic_block top_bb = NULL;
744 int i;
745 bool cfg_changed = false;
746
747 type = TREE_TYPE (name);
748 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
749 {
750 if (gimple_code (use_stmt) != GIMPLE_CALL
751 || !gimple_call_lhs (use_stmt)
752 || !(fndecl = gimple_call_fndecl (use_stmt))
753 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
754 continue;
755
756 switch (DECL_FUNCTION_CODE (fndecl))
757 {
758 CASE_FLT_FN (BUILT_IN_COS):
759 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
760 break;
761
762 CASE_FLT_FN (BUILT_IN_SIN):
763 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
764 break;
765
766 CASE_FLT_FN (BUILT_IN_CEXPI):
767 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
768 break;
769
770 default:;
771 }
772 }
773
774 if (seen_cos + seen_sin + seen_cexpi <= 1)
775 {
776 stmts.release ();
777 return false;
778 }
779
780 /* Simply insert cexpi at the beginning of top_bb but not earlier than
781 the name def statement. */
782 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
783 if (!fndecl)
784 return false;
785 stmt = gimple_build_call (fndecl, 1, name);
786 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
787 gimple_call_set_lhs (stmt, res);
788
789 def_stmt = SSA_NAME_DEF_STMT (name);
790 if (!SSA_NAME_IS_DEFAULT_DEF (name)
791 && gimple_code (def_stmt) != GIMPLE_PHI
792 && gimple_bb (def_stmt) == top_bb)
793 {
794 gsi = gsi_for_stmt (def_stmt);
795 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
796 }
797 else
798 {
799 gsi = gsi_after_labels (top_bb);
800 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
801 }
802 sincos_stats.inserted++;
803
804 /* And adjust the recorded old call sites. */
805 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
806 {
807 tree rhs = NULL;
808 fndecl = gimple_call_fndecl (use_stmt);
809
810 switch (DECL_FUNCTION_CODE (fndecl))
811 {
812 CASE_FLT_FN (BUILT_IN_COS):
813 rhs = fold_build1 (REALPART_EXPR, type, res);
814 break;
815
816 CASE_FLT_FN (BUILT_IN_SIN):
817 rhs = fold_build1 (IMAGPART_EXPR, type, res);
818 break;
819
820 CASE_FLT_FN (BUILT_IN_CEXPI):
821 rhs = res;
822 break;
823
824 default:;
825 gcc_unreachable ();
826 }
827
828 /* Replace call with a copy. */
829 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
830
831 gsi = gsi_for_stmt (use_stmt);
832 gsi_replace (&gsi, stmt, true);
833 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
834 cfg_changed = true;
835 }
836
837 stmts.release ();
838
839 return cfg_changed;
840 }
841
842 /* To evaluate powi(x,n), the floating point value x raised to the
843 constant integer exponent n, we use a hybrid algorithm that
844 combines the "window method" with look-up tables. For an
845 introduction to exponentiation algorithms and "addition chains",
846 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
847 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
848 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
849 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
850
851 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
852 multiplications to inline before calling the system library's pow
853 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
854 so this default never requires calling pow, powf or powl. */
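/* (Rationale for the bound above: binary exponentiation of an exponent with
   b significant bits needs at most b-1 squarings plus at most b-1 extra
   multiplies, i.e. at most 2*b-2 multiplications.)  */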
855
856 #ifndef POWI_MAX_MULTS
857 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
858 #endif
859
860 /* The size of the "optimal power tree" lookup table. All
861 exponents less than this value are simply looked up in the
862 powi_table below. This threshold is also used to size the
863 cache of pseudo registers that hold intermediate results. */
864 #define POWI_TABLE_SIZE 256
865
866 /* The size, in bits of the window, used in the "window method"
867 exponentiation algorithm. This is equivalent to a radix of
868 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
869 #define POWI_WINDOW_SIZE 3
870
871 /* The following table is an efficient representation of an
872 "optimal power tree". For each value, i, the corresponding
873 value, j, in the table states that an optimal evaluation
874 sequence for calculating pow(x,i) can be found by evaluating
875 pow(x,j)*pow(x,i-j). An optimal power tree for the first
876 100 integers is given in Knuth's "Seminumerical algorithms". */
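/* For example, powi_table[23] == 13, so pow (x, 23) is computed as
   pow (x, 13) * pow (x, 10); expanding further, pow (x, 13) = pow (x, 10)
   * pow (x, 3), pow (x, 10) = pow (x, 5) * pow (x, 5), pow (x, 5)
   = pow (x, 3) * pow (x, 2), pow (x, 3) = pow (x, 2) * x and
   pow (x, 2) = x * x, for a total of six multiplications once the shared
   factors are cached.  */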
877
878 static const unsigned char powi_table[POWI_TABLE_SIZE] =
879 {
880 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
881 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
882 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
883 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
884 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
885 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
886 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
887 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
888 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
889 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
890 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
891 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
892 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
893 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
894 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
895 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
896 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
897 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
898 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
899 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
900 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
901 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
902 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
903 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
904 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
905 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
906 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
907 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
908 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
909 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
910 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
911 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
912 };
913
914
915 /* Return the number of multiplications required to calculate
916 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
917 subroutine of powi_cost. CACHE is an array indicating
918 which exponents have already been calculated. */
919
920 static int
921 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
922 {
923 /* If we've already calculated this exponent, then this evaluation
924 doesn't require any additional multiplications. */
925 if (cache[n])
926 return 0;
927
928 cache[n] = true;
929 return powi_lookup_cost (n - powi_table[n], cache)
930 + powi_lookup_cost (powi_table[n], cache) + 1;
931 }
932
933 /* Return the number of multiplications required to calculate
934 powi(x,n) for an arbitrary x, given the exponent N. This
935 function needs to be kept in sync with powi_as_mults below. */
936
937 static int
938 powi_cost (HOST_WIDE_INT n)
939 {
940 bool cache[POWI_TABLE_SIZE];
941 unsigned HOST_WIDE_INT digit;
942 unsigned HOST_WIDE_INT val;
943 int result;
944
945 if (n == 0)
946 return 0;
947
948 /* Ignore the reciprocal when calculating the cost. */
949 val = (n < 0) ? -n : n;
950
951 /* Initialize the exponent cache. */
952 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
953 cache[1] = true;
954
955 result = 0;
956
957 while (val >= POWI_TABLE_SIZE)
958 {
959 if (val & 1)
960 {
961 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
962 result += powi_lookup_cost (digit, cache)
963 + POWI_WINDOW_SIZE + 1;
964 val >>= POWI_WINDOW_SIZE;
965 }
966 else
967 {
968 val >>= 1;
969 result++;
970 }
971 }
972
973 return result + powi_lookup_cost (val, cache);
974 }
975
976 /* Recursive subroutine of powi_as_mults. This function takes the
977 array, CACHE, of already calculated exponents and an exponent N and
978 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
979
980 static tree
981 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
982 HOST_WIDE_INT n, tree *cache)
983 {
984 tree op0, op1, ssa_target;
985 unsigned HOST_WIDE_INT digit;
986 gimple mult_stmt;
987
988 if (n < POWI_TABLE_SIZE && cache[n])
989 return cache[n];
990
991 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
992
993 if (n < POWI_TABLE_SIZE)
994 {
995 cache[n] = ssa_target;
996 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
997 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
998 }
999 else if (n & 1)
1000 {
1001 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1002 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1003 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1004 }
1005 else
1006 {
1007 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1008 op1 = op0;
1009 }
1010
1011 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1012 gimple_set_location (mult_stmt, loc);
1013 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1014
1015 return ssa_target;
1016 }
1017
1018 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1019 This function needs to be kept in sync with powi_cost above. */
1020
1021 static tree
1022 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1023 tree arg0, HOST_WIDE_INT n)
1024 {
1025 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1026 gimple div_stmt;
1027 tree target;
1028
1029 if (n == 0)
1030 return build_real (type, dconst1);
1031
1032 memset (cache, 0, sizeof (cache));
1033 cache[1] = arg0;
1034
1035 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1036 if (n >= 0)
1037 return result;
1038
1039 /* If the original exponent was negative, reciprocate the result. */
1040 target = make_temp_ssa_name (type, NULL, "powmult");
1041 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1042 build_real (type, dconst1),
1043 result);
1044 gimple_set_location (div_stmt, loc);
1045 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1046
1047 return target;
1048 }
1049
1050 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1051 location info LOC. If the arguments are appropriate, create an
1052 equivalent sequence of statements prior to GSI using an optimal
1053 number of multiplications, and return an expression holding the
1054 result. */
1055
1056 static tree
1057 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1058 tree arg0, HOST_WIDE_INT n)
1059 {
1060 /* Avoid largest negative number. */
1061 if (n != -n
1062 && ((n >= -1 && n <= 2)
1063 || (optimize_function_for_speed_p (cfun)
1064 && powi_cost (n) <= POWI_MAX_MULTS)))
1065 return powi_as_mults (gsi, loc, arg0, n);
1066
1067 return NULL_TREE;
1068 }
1069
1070 /* Build a gimple call statement that calls FN with argument ARG.
1071 Set the lhs of the call statement to a fresh SSA name. Insert the
1072 statement prior to GSI's current position, and return the fresh
1073 SSA name. */
1074
1075 static tree
1076 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1077 tree fn, tree arg)
1078 {
1079 gimple call_stmt;
1080 tree ssa_target;
1081
1082 call_stmt = gimple_build_call (fn, 1, arg);
1083 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1084 gimple_set_lhs (call_stmt, ssa_target);
1085 gimple_set_location (call_stmt, loc);
1086 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1087
1088 return ssa_target;
1089 }
1090
1091 /* Build a gimple binary operation with the given CODE and arguments
1092 ARG0, ARG1, assigning the result to a new SSA name for variable
1093 TARGET. Insert the statement prior to GSI's current position, and
1094 return the fresh SSA name.*/
1095
1096 static tree
1097 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1098 const char *name, enum tree_code code,
1099 tree arg0, tree arg1)
1100 {
1101 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1102 gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1103 gimple_set_location (stmt, loc);
1104 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1105 return result;
1106 }
1107
1108 /* Build a gimple reference operation with the given CODE and argument
1109 ARG, assigning the result to a new SSA name of TYPE with NAME.
1110 Insert the statement prior to GSI's current position, and return
1111 the fresh SSA name. */
1112
1113 static inline tree
1114 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1115 const char *name, enum tree_code code, tree arg0)
1116 {
1117 tree result = make_temp_ssa_name (type, NULL, name);
1118 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1119 gimple_set_location (stmt, loc);
1120 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1121 return result;
1122 }
1123
1124 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1125 prior to GSI's current position, and return the fresh SSA name. */
1126
1127 static tree
1128 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1129 tree type, tree val)
1130 {
1131 tree result = make_ssa_name (type, NULL);
1132 gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1133 gimple_set_location (stmt, loc);
1134 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1135 return result;
1136 }
1137
1138 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1139 with location info LOC. If possible, create an equivalent and
1140 less expensive sequence of statements prior to GSI, and return an
1141 expression holding the result. */
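/* For illustration, the transformations attempted below include
   pow (x, 0.5) -> sqrt (x), pow (x, 0.25) -> sqrt (sqrt (x)),
   pow (x, 0.75) -> sqrt (x) * sqrt (sqrt (x)), pow (x, 1./3.) -> cbrt (x)
   and pow (x, 1./6.) -> cbrt (sqrt (x)), each guarded by the conditions
   documented next to the corresponding case.  */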
1142
1143 static tree
1144 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1145 tree arg0, tree arg1)
1146 {
1147 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1148 REAL_VALUE_TYPE c2, dconst3;
1149 HOST_WIDE_INT n;
1150 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1151 machine_mode mode;
1152 bool hw_sqrt_exists, c_is_int, c2_is_int;
1153
1154 /* If the exponent isn't a constant, there's nothing of interest
1155 to be done. */
1156 if (TREE_CODE (arg1) != REAL_CST)
1157 return NULL_TREE;
1158
1159 /* If the exponent is equivalent to an integer, expand to an optimal
1160 multiplication sequence when profitable. */
1161 c = TREE_REAL_CST (arg1);
1162 n = real_to_integer (&c);
1163 real_from_integer (&cint, VOIDmode, n, SIGNED);
1164 c_is_int = real_identical (&c, &cint);
1165
1166 if (c_is_int
1167 && ((n >= -1 && n <= 2)
1168 || (flag_unsafe_math_optimizations
1169 && optimize_bb_for_speed_p (gsi_bb (*gsi))
1170 && powi_cost (n) <= POWI_MAX_MULTS)))
1171 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1172
1173 /* Attempt various optimizations using sqrt and cbrt. */
1174 type = TREE_TYPE (arg0);
1175 mode = TYPE_MODE (type);
1176 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1177
1178 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1179 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1180 sqrt(-0) = -0. */
1181 if (sqrtfn
1182 && REAL_VALUES_EQUAL (c, dconsthalf)
1183 && !HONOR_SIGNED_ZEROS (mode))
1184 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1185
1186 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1187 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1188 so do this optimization even if -Os. Don't do this optimization
1189 if we don't have a hardware sqrt insn. */
1190 dconst1_4 = dconst1;
1191 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1192 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1193
1194 if (flag_unsafe_math_optimizations
1195 && sqrtfn
1196 && REAL_VALUES_EQUAL (c, dconst1_4)
1197 && hw_sqrt_exists)
1198 {
1199 /* sqrt(x) */
1200 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1201
1202 /* sqrt(sqrt(x)) */
1203 return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1204 }
1205
1206 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1207 optimizing for space. Don't do this optimization if we don't have
1208 a hardware sqrt insn. */
1209 real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1210 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1211
1212 if (flag_unsafe_math_optimizations
1213 && sqrtfn
1214 && optimize_function_for_speed_p (cfun)
1215 && REAL_VALUES_EQUAL (c, dconst3_4)
1216 && hw_sqrt_exists)
1217 {
1218 /* sqrt(x) */
1219 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1220
1221 /* sqrt(sqrt(x)) */
1222 sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1223
1224 /* sqrt(x) * sqrt(sqrt(x)) */
1225 return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1226 sqrt_arg0, sqrt_sqrt);
1227 }
1228
1229 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1230 optimizations since 1./3. is not exactly representable. If x
1231 is negative and finite, the correct value of pow(x,1./3.) is
1232 a NaN with the "invalid" exception raised, because the value
1233 of 1./3. actually has an even denominator. The correct value
1234 of cbrt(x) is a negative real value. */
1235 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1236 dconst1_3 = real_value_truncate (mode, dconst_third ());
1237
1238 if (flag_unsafe_math_optimizations
1239 && cbrtfn
1240 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1241 && REAL_VALUES_EQUAL (c, dconst1_3))
1242 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1243
1244 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1245 if we don't have a hardware sqrt insn. */
1246 dconst1_6 = dconst1_3;
1247 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1248
1249 if (flag_unsafe_math_optimizations
1250 && sqrtfn
1251 && cbrtfn
1252 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1253 && optimize_function_for_speed_p (cfun)
1254 && hw_sqrt_exists
1255 && REAL_VALUES_EQUAL (c, dconst1_6))
1256 {
1257 /* sqrt(x) */
1258 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1259
1260 /* cbrt(sqrt(x)) */
1261 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1262 }
1263
1264 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1265 and c not an integer, into
1266
1267 sqrt(x) * powi(x, n/2), n > 0;
1268 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1269
1270 Do not calculate the powi factor when n/2 = 0. */
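  /* For instance, c = 3.5 gives n = 7 and the result
     sqrt (x) * powi (x, 3), while c = -2.5 gives n = -5 and the result
     1.0 / (sqrt (x) * powi (x, 2)).  */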
1271 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1272 n = real_to_integer (&c2);
1273 real_from_integer (&cint, VOIDmode, n, SIGNED);
1274 c2_is_int = real_identical (&c2, &cint);
1275
1276 if (flag_unsafe_math_optimizations
1277 && sqrtfn
1278 && c2_is_int
1279 && !c_is_int
1280 && optimize_function_for_speed_p (cfun))
1281 {
1282 tree powi_x_ndiv2 = NULL_TREE;
1283
1284 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1285 possible or profitable, give up. Skip the degenerate case when
1286 n is 1 or -1, where the result is always 1. */
1287 if (absu_hwi (n) != 1)
1288 {
1289 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1290 abs_hwi (n / 2));
1291 if (!powi_x_ndiv2)
1292 return NULL_TREE;
1293 }
1294
1295 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1296 result of the optimal multiply sequence just calculated. */
1297 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1298
1299 if (absu_hwi (n) == 1)
1300 result = sqrt_arg0;
1301 else
1302 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1303 sqrt_arg0, powi_x_ndiv2);
1304
1305 /* If n is negative, reciprocate the result. */
1306 if (n < 0)
1307 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1308 build_real (type, dconst1), result);
1309 return result;
1310 }
1311
1312 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1313
1314 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1315 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1316
1317 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1318 different from pow(x, 1./3.) due to rounding and behavior with
1319 negative x, we need to constrain this transformation to unsafe
1320 math and positive x or finite math. */
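  /* For instance, c = 5./3. gives n = 5 and the result
     powi (x, 1) * powi (cbrt (x), 2), i.e. x * cbrt (x) * cbrt (x), while
     c = -4./3. gives n = -4 and the result
     1.0 / (powi (x, 1) * powi (cbrt (x), 1)).  */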
1321 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1322 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1323 real_round (&c2, mode, &c2);
1324 n = real_to_integer (&c2);
1325 real_from_integer (&cint, VOIDmode, n, SIGNED);
1326 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1327 real_convert (&c2, mode, &c2);
1328
1329 if (flag_unsafe_math_optimizations
1330 && cbrtfn
1331 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1332 && real_identical (&c2, &c)
1333 && !c2_is_int
1334 && optimize_function_for_speed_p (cfun)
1335 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1336 {
1337 tree powi_x_ndiv3 = NULL_TREE;
1338
1339 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1340 possible or profitable, give up. Skip the degenerate case when
1341 abs(n) < 3, where the result is always 1. */
1342 if (absu_hwi (n) >= 3)
1343 {
1344 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1345 abs_hwi (n / 3));
1346 if (!powi_x_ndiv3)
1347 return NULL_TREE;
1348 }
1349
1350 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1351 as that creates an unnecessary variable. Instead, just produce
1352 either cbrt(x) or cbrt(x) * cbrt(x). */
1353 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1354
1355 if (absu_hwi (n) % 3 == 1)
1356 powi_cbrt_x = cbrt_x;
1357 else
1358 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1359 cbrt_x, cbrt_x);
1360
1361 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1362 if (absu_hwi (n) < 3)
1363 result = powi_cbrt_x;
1364 else
1365 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1366 powi_x_ndiv3, powi_cbrt_x);
1367
1368 /* If n is negative, reciprocate the result. */
1369 if (n < 0)
1370 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1371 build_real (type, dconst1), result);
1372
1373 return result;
1374 }
1375
1376 /* No optimizations succeeded. */
1377 return NULL_TREE;
1378 }
1379
1380 /* ARG is the argument to a cabs builtin call in GSI with location info
1381 LOC. Create a sequence of statements prior to GSI that calculates
1382 sqrt(R*R + I*I), where R and I are the real and imaginary components
1383 of ARG, respectively. Return an expression holding the result. */
1384
1385 static tree
1386 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1387 {
1388 tree real_part, imag_part, addend1, addend2, sum, result;
1389 tree type = TREE_TYPE (TREE_TYPE (arg));
1390 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1391 machine_mode mode = TYPE_MODE (type);
1392
1393 if (!flag_unsafe_math_optimizations
1394 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1395 || !sqrtfn
1396 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1397 return NULL_TREE;
1398
1399 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1400 REALPART_EXPR, arg);
1401 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1402 real_part, real_part);
1403 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1404 IMAGPART_EXPR, arg);
1405 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1406 imag_part, imag_part);
1407 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1408 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1409
1410 return result;
1411 }
1412
1413 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1414 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1415 an optimal number of multiplies, when n is a constant. */
1416
1417 namespace {
1418
1419 const pass_data pass_data_cse_sincos =
1420 {
1421 GIMPLE_PASS, /* type */
1422 "sincos", /* name */
1423 OPTGROUP_NONE, /* optinfo_flags */
1424 TV_NONE, /* tv_id */
1425 PROP_ssa, /* properties_required */
1426 0, /* properties_provided */
1427 0, /* properties_destroyed */
1428 0, /* todo_flags_start */
1429 TODO_update_ssa, /* todo_flags_finish */
1430 };
1431
1432 class pass_cse_sincos : public gimple_opt_pass
1433 {
1434 public:
1435 pass_cse_sincos (gcc::context *ctxt)
1436 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1437 {}
1438
1439 /* opt_pass methods: */
1440 virtual bool gate (function *)
1441 {
1442 /* We no longer require either sincos or cexp, since powi expansion
1443 piggybacks on this pass. */
1444 return optimize;
1445 }
1446
1447 virtual unsigned int execute (function *);
1448
1449 }; // class pass_cse_sincos
1450
1451 unsigned int
1452 pass_cse_sincos::execute (function *fun)
1453 {
1454 basic_block bb;
1455 bool cfg_changed = false;
1456
1457 calculate_dominance_info (CDI_DOMINATORS);
1458 memset (&sincos_stats, 0, sizeof (sincos_stats));
1459
1460 FOR_EACH_BB_FN (bb, fun)
1461 {
1462 gimple_stmt_iterator gsi;
1463 bool cleanup_eh = false;
1464
1465 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1466 {
1467 gimple stmt = gsi_stmt (gsi);
1468 tree fndecl;
1469
1470 /* Only the last stmt in a bb could throw; there is no need to call
1471 gimple_purge_dead_eh_edges if we change something in the middle
1472 of a basic block. */
1473 cleanup_eh = false;
1474
1475 if (is_gimple_call (stmt)
1476 && gimple_call_lhs (stmt)
1477 && (fndecl = gimple_call_fndecl (stmt))
1478 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1479 {
1480 tree arg, arg0, arg1, result;
1481 HOST_WIDE_INT n;
1482 location_t loc;
1483
1484 switch (DECL_FUNCTION_CODE (fndecl))
1485 {
1486 CASE_FLT_FN (BUILT_IN_COS):
1487 CASE_FLT_FN (BUILT_IN_SIN):
1488 CASE_FLT_FN (BUILT_IN_CEXPI):
1489 /* Make sure we have either sincos or cexp. */
1490 if (!targetm.libc_has_function (function_c99_math_complex)
1491 && !targetm.libc_has_function (function_sincos))
1492 break;
1493
1494 arg = gimple_call_arg (stmt, 0);
1495 if (TREE_CODE (arg) == SSA_NAME)
1496 cfg_changed |= execute_cse_sincos_1 (arg);
1497 break;
1498
1499 CASE_FLT_FN (BUILT_IN_POW):
1500 arg0 = gimple_call_arg (stmt, 0);
1501 arg1 = gimple_call_arg (stmt, 1);
1502
1503 loc = gimple_location (stmt);
1504 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1505
1506 if (result)
1507 {
1508 tree lhs = gimple_get_lhs (stmt);
1509 gimple new_stmt = gimple_build_assign (lhs, result);
1510 gimple_set_location (new_stmt, loc);
1511 unlink_stmt_vdef (stmt);
1512 gsi_replace (&gsi, new_stmt, true);
1513 cleanup_eh = true;
1514 if (gimple_vdef (stmt))
1515 release_ssa_name (gimple_vdef (stmt));
1516 }
1517 break;
1518
1519 CASE_FLT_FN (BUILT_IN_POWI):
1520 arg0 = gimple_call_arg (stmt, 0);
1521 arg1 = gimple_call_arg (stmt, 1);
1522 loc = gimple_location (stmt);
1523
1524 if (real_minus_onep (arg0))
1525 {
1526 tree t0, t1, cond, one, minus_one;
1527 gimple stmt;
1528
1529 t0 = TREE_TYPE (arg0);
1530 t1 = TREE_TYPE (arg1);
1531 one = build_real (t0, dconst1);
1532 minus_one = build_real (t0, dconstm1);
1533
1534 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1535 stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1536 arg1,
1537 build_int_cst (t1,
1538 1));
1539 gimple_set_location (stmt, loc);
1540 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1541
1542 result = make_temp_ssa_name (t0, NULL, "powi");
1543 stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1544 cond,
1545 minus_one, one);
1546 gimple_set_location (stmt, loc);
1547 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1548 }
1549 else
1550 {
1551 if (!tree_fits_shwi_p (arg1))
1552 break;
1553
1554 n = tree_to_shwi (arg1);
1555 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1556 }
1557
1558 if (result)
1559 {
1560 tree lhs = gimple_get_lhs (stmt);
1561 gimple new_stmt = gimple_build_assign (lhs, result);
1562 gimple_set_location (new_stmt, loc);
1563 unlink_stmt_vdef (stmt);
1564 gsi_replace (&gsi, new_stmt, true);
1565 cleanup_eh = true;
1566 if (gimple_vdef (stmt))
1567 release_ssa_name (gimple_vdef (stmt));
1568 }
1569 break;
1570
1571 CASE_FLT_FN (BUILT_IN_CABS):
1572 arg0 = gimple_call_arg (stmt, 0);
1573 loc = gimple_location (stmt);
1574 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1575
1576 if (result)
1577 {
1578 tree lhs = gimple_get_lhs (stmt);
1579 gimple new_stmt = gimple_build_assign (lhs, result);
1580 gimple_set_location (new_stmt, loc);
1581 unlink_stmt_vdef (stmt);
1582 gsi_replace (&gsi, new_stmt, true);
1583 cleanup_eh = true;
1584 if (gimple_vdef (stmt))
1585 release_ssa_name (gimple_vdef (stmt));
1586 }
1587 break;
1588
1589 default:;
1590 }
1591 }
1592 }
1593 if (cleanup_eh)
1594 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1595 }
1596
1597 statistics_counter_event (fun, "sincos statements inserted",
1598 sincos_stats.inserted);
1599
1600 free_dominance_info (CDI_DOMINATORS);
1601 return cfg_changed ? TODO_cleanup_cfg : 0;
1602 }
1603
1604 } // anon namespace
1605
1606 gimple_opt_pass *
1607 make_pass_cse_sincos (gcc::context *ctxt)
1608 {
1609 return new pass_cse_sincos (ctxt);
1610 }
1611
1612 /* A symbolic number is used to detect byte permutation and selection
1613 patterns. Therefore the field N contains an artificial number
1614 consisting of octet sized markers:
1615
1616 0 - target byte has the value 0
1617 FF - target byte has an unknown value (e.g. due to sign extension)
1618 1..size - marker value is the target byte index minus one.
1619
1620 To detect permutations on memory sources (arrays and structures), a symbolic
1621 number is also associated with a base address (the array or structure the
1622 load is made from), an offset from that base address and a range which gives
1623 the difference between the highest and lowest memory locations accessed to
1624 make such a symbolic number. The range is thus different from size, which
1625 reflects the size of the type of the current expression. Note that for a
1626 non-memory source, range holds the same value as size.
1627
1628 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1629 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1630 still have a size of 2 but this time a range of 1. */
1631
1632 struct symbolic_number {
1633 uint64_t n;
1634 tree type;
1635 tree base_addr;
1636 tree offset;
1637 HOST_WIDE_INT bytepos;
1638 tree alias_set;
1639 tree vuse;
1640 unsigned HOST_WIDE_INT range;
1641 };
1642
1643 #define BITS_PER_MARKER 8
1644 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1645 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1646 #define HEAD_MARKER(n, size) \
1647 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1648
1649 /* The number which the find_bswap_or_nop_1 result should match in
1650 order to have a nop. The number is masked according to the size of
1651 the symbolic number before using it. */
1652 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1653 (uint64_t)0x08070605 << 32 | 0x04030201)
1654
1655 /* The number which the find_bswap_or_nop_1 result should match in
1656 order to have a byte swap. The number is masked according to the
1657 size of the symbolic number before using it. */
1658 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1659 (uint64_t)0x01020304 << 32 | 0x05060708)
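/* For example, a plain 32-bit load yields the symbolic number 0x04030201
   (CMPNOP reduced to four markers); if the statements being analyzed then
   reassemble the value with its four bytes reversed, the symbolic number
   becomes 0x01020304, which matches CMPXCHG reduced to four markers, and a
   byte swap is recognized.  */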
1660
1661 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1662 number N. Return false if the requested operation is not permitted
1663 on a symbolic number. */
1664
1665 static inline bool
1666 do_shift_rotate (enum tree_code code,
1667 struct symbolic_number *n,
1668 int count)
1669 {
1670 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1671 unsigned head_marker;
1672
1673 if (count % BITS_PER_UNIT != 0)
1674 return false;
1675 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1676
1677 /* Zero out the extra bits of N in order to avoid them being shifted
1678 into the significant bits. */
1679 if (size < 64 / BITS_PER_MARKER)
1680 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1681
1682 switch (code)
1683 {
1684 case LSHIFT_EXPR:
1685 n->n <<= count;
1686 break;
1687 case RSHIFT_EXPR:
1688 head_marker = HEAD_MARKER (n->n, size);
1689 n->n >>= count;
1690 /* Arithmetic shift of signed type: result is dependent on the value. */
1691 if (!TYPE_UNSIGNED (n->type) && head_marker)
1692 for (i = 0; i < count / BITS_PER_MARKER; i++)
1693 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1694 << ((size - 1 - i) * BITS_PER_MARKER);
1695 break;
1696 case LROTATE_EXPR:
1697 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1698 break;
1699 case RROTATE_EXPR:
1700 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1701 break;
1702 default:
1703 return false;
1704 }
1705 /* Zero unused bits for size. */
1706 if (size < 64 / BITS_PER_MARKER)
1707 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1708 return true;
1709 }
1710
1711 /* Perform sanity checking for the symbolic number N and the gimple
1712 statement STMT. */
1713
1714 static inline bool
1715 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1716 {
1717 tree lhs_type;
1718
1719 lhs_type = gimple_expr_type (stmt);
1720
1721 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1722 return false;
1723
1724 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1725 return false;
1726
1727 return true;
1728 }
1729
1730 /* Initialize the symbolic number N for the bswap pass from the base element
1731 SRC manipulated by the bitwise OR expression. */
1732
1733 static bool
1734 init_symbolic_number (struct symbolic_number *n, tree src)
1735 {
1736 int size;
1737
1738 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1739
1740 /* Set up the symbolic number N by setting each byte to a value between 1 and
1741 the byte size of rhs1. The highest order byte is set to that byte size and
1742 the lowest order byte to 1.
1743 n->type = TREE_TYPE (src);
1744 size = TYPE_PRECISION (n->type);
1745 if (size % BITS_PER_UNIT != 0)
1746 return false;
1747 size /= BITS_PER_UNIT;
1748 if (size > 64 / BITS_PER_MARKER)
1749 return false;
1750 n->range = size;
1751 n->n = CMPNOP;
1752
1753 if (size < 64 / BITS_PER_MARKER)
1754 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1755
1756 return true;
1757 }
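
/* For example (with BITS_PER_MARKER == 8), a 32-bit SRC gives
   n->n == 0x04030201 (CMPNOP masked down to four markers) and
   n->range == 4, while a 64-bit SRC gives the full
   n->n == 0x0807060504030201 with n->range == 8.  */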
1758
1759 /* Check if STMT might be a byte swap or a nop from a memory source and return
1760 the answer. If so, REF is that memory source and the base of the memory area
1761 accessed and the offset of the access from that base are recorded in N. */
1762
1763 bool
1764 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1765 {
1766 /* Leaf node is an array or component ref. Memorize its base and
1767 offset from base to compare to other such leaf node. */
1768 HOST_WIDE_INT bitsize, bitpos;
1769 machine_mode mode;
1770 int unsignedp, volatilep;
1771 tree offset, base_addr;
1772
1773 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1774 return false;
1775
1776 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1777 &unsignedp, &volatilep, false);
1778
1779 if (TREE_CODE (base_addr) == MEM_REF)
1780 {
1781 offset_int bit_offset = 0;
1782 tree off = TREE_OPERAND (base_addr, 1);
1783
1784 if (!integer_zerop (off))
1785 {
1786 offset_int boff, coff = mem_ref_offset (base_addr);
1787 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1788 bit_offset += boff;
1789 }
1790
1791 base_addr = TREE_OPERAND (base_addr, 0);
1792
1793 /* Avoid returning a negative bitpos as this may wreak havoc later. */
1794 if (wi::neg_p (bit_offset))
1795 {
1796 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1797 offset_int tem = bit_offset.and_not (mask);
1798 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1799 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
1800 bit_offset -= tem;
1801 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1802 if (offset)
1803 offset = size_binop (PLUS_EXPR, offset,
1804 wide_int_to_tree (sizetype, tem));
1805 else
1806 offset = wide_int_to_tree (sizetype, tem);
1807 }
1808
1809 bitpos += bit_offset.to_shwi ();
1810 }
1811
1812 if (bitpos % BITS_PER_UNIT)
1813 return false;
1814 if (bitsize % BITS_PER_UNIT)
1815 return false;
1816
1817 if (!init_symbolic_number (n, ref))
1818 return false;
1819 n->base_addr = base_addr;
1820 n->offset = offset;
1821 n->bytepos = bitpos / BITS_PER_UNIT;
1822 n->alias_set = reference_alias_ptr_type (ref);
1823 n->vuse = gimple_vuse (stmt);
1824 return true;
1825 }
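
/* For illustration, the per-byte loads of a pattern such as

     uint32_t
     load_le32 (const unsigned char *p)
     {
       return (uint32_t) p[0]
              | ((uint32_t) p[1] << 8)
              | ((uint32_t) p[2] << 16)
              | ((uint32_t) p[3] << 24);
     }

   are the leaves recorded here: each p[i] gets the same base address P
   and byte position I in its symbolic number, which is what later allows
   the whole OR tree to be recognized as a single 4-byte access.  */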
1826
1827 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1828 the operation given by the rhs of STMT on the result. If the operation
1829 could successfully be executed the function returns a gimple stmt whose
1830 rhs's first tree is the expression of the source operand; otherwise it
1831 returns NULL. */
1832
1833 static gimple
1834 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1835 {
1836 enum tree_code code;
1837 tree rhs1, rhs2 = NULL;
1838 gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1839 enum gimple_rhs_class rhs_class;
1840
1841 if (!limit || !is_gimple_assign (stmt))
1842 return NULL;
1843
1844 rhs1 = gimple_assign_rhs1 (stmt);
1845
1846 if (find_bswap_or_nop_load (stmt, rhs1, n))
1847 return stmt;
1848
1849 if (TREE_CODE (rhs1) != SSA_NAME)
1850 return NULL;
1851
1852 code = gimple_assign_rhs_code (stmt);
1853 rhs_class = gimple_assign_rhs_class (stmt);
1854 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1855
1856 if (rhs_class == GIMPLE_BINARY_RHS)
1857 rhs2 = gimple_assign_rhs2 (stmt);
1858
1859 /* Handle unary rhs and binary rhs with integer constants as second
1860 operand. */
1861
1862 if (rhs_class == GIMPLE_UNARY_RHS
1863 || (rhs_class == GIMPLE_BINARY_RHS
1864 && TREE_CODE (rhs2) == INTEGER_CST))
1865 {
1866 if (code != BIT_AND_EXPR
1867 && code != LSHIFT_EXPR
1868 && code != RSHIFT_EXPR
1869 && code != LROTATE_EXPR
1870 && code != RROTATE_EXPR
1871 && !CONVERT_EXPR_CODE_P (code))
1872 return NULL;
1873
1874 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1875
1876 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1877 we have to initialize the symbolic number. */
1878 if (!source_stmt1)
1879 {
1880 if (gimple_assign_load_p (stmt)
1881 || !init_symbolic_number (n, rhs1))
1882 return NULL;
1883 source_stmt1 = stmt;
1884 }
1885
1886 switch (code)
1887 {
1888 case BIT_AND_EXPR:
1889 {
1890 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1891 uint64_t val = int_cst_value (rhs2), mask = 0;
1892 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
1893
1894 /* Only constants masking full bytes are allowed. */
1895 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
1896 if ((val & tmp) != 0 && (val & tmp) != tmp)
1897 return NULL;
1898 else if (val & tmp)
1899 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
1900
1901 n->n &= mask;
1902 }
1903 break;
1904 case LSHIFT_EXPR:
1905 case RSHIFT_EXPR:
1906 case LROTATE_EXPR:
1907 case RROTATE_EXPR:
1908 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1909 return NULL;
1910 break;
1911 CASE_CONVERT:
1912 {
1913 int i, type_size, old_type_size;
1914 tree type;
1915
1916 type = gimple_expr_type (stmt);
1917 type_size = TYPE_PRECISION (type);
1918 if (type_size % BITS_PER_UNIT != 0)
1919 return NULL;
1920 type_size /= BITS_PER_UNIT;
1921 if (type_size > 64 / BITS_PER_MARKER)
1922 return NULL;
1923
1924 /* Sign extension: result is dependent on the value. */
1925 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1926 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
1927 && HEAD_MARKER (n->n, old_type_size))
1928 for (i = 0; i < type_size - old_type_size; i++)
1929 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1930 << ((type_size - 1 - i) * BITS_PER_MARKER);
1931
1932 if (type_size < 64 / BITS_PER_MARKER)
1933 {
1934 /* If STMT casts to a smaller type mask out the bits not
1935 belonging to the target type. */
1936 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
1937 }
1938 n->type = type;
1939 if (!n->base_addr)
1940 n->range = type_size;
1941 }
1942 break;
1943 default:
1944 return NULL;
1945 };
1946 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
1947 }
1948
1949 /* Handle binary rhs. */
1950
1951 if (rhs_class == GIMPLE_BINARY_RHS)
1952 {
1953 int i, size;
1954 struct symbolic_number n1, n2;
1955 uint64_t mask;
1956 gimple source_stmt2;
1957
1958 if (code != BIT_IOR_EXPR)
1959 return NULL;
1960
1961 if (TREE_CODE (rhs2) != SSA_NAME)
1962 return NULL;
1963
1964 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1965
1966 switch (code)
1967 {
1968 case BIT_IOR_EXPR:
1969 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1970
1971 if (!source_stmt1)
1972 return NULL;
1973
1974 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1975
1976 if (!source_stmt2)
1977 return NULL;
1978
1979 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1980 return NULL;
1981
1982 if (!n1.vuse != !n2.vuse
1983 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1984 return NULL;
1985
1986 if (gimple_assign_rhs1 (source_stmt1)
1987 != gimple_assign_rhs1 (source_stmt2))
1988 {
1989 int64_t inc;
1990 HOST_WIDE_INT off_sub;
1991 struct symbolic_number *n_ptr;
1992
1993 if (!n1.base_addr || !n2.base_addr
1994 || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1995 return NULL;
1996 if (!n1.offset != !n2.offset
1997 || (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1998 return NULL;
1999
2000 /* We swap n1 with n2 to have n1 < n2. */
2001 if (n2.bytepos < n1.bytepos)
2002 {
2003 struct symbolic_number tmpn;
2004
2005 tmpn = n2;
2006 n2 = n1;
2007 n1 = tmpn;
2008 source_stmt1 = source_stmt2;
2009 }
2010
2011 off_sub = n2.bytepos - n1.bytepos;
2012
2013 /* Check that the range of memory covered can be represented by
2014 a symbolic number. */
2015 if (off_sub + n2.range > 64 / BITS_PER_MARKER)
2016 return NULL;
2017 n->range = n2.range + off_sub;
2018
2019 /* Reinterpret the byte marks in the symbolic number holding the value
2020 of higher weight according to target endianness. */
2021 inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
2022 size = TYPE_PRECISION (n1.type) / BITS_PER_UNIT;
2023 if (BYTES_BIG_ENDIAN)
2024 n_ptr = &n1;
2025 else
2026 n_ptr = &n2;
2027 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2028 {
2029 unsigned marker =
2030 (n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2031 if (marker && marker != MARKER_BYTE_UNKNOWN)
2032 n_ptr->n += inc;
2033 }
2034 }
2035 else
2036 n->range = n1.range;
2037
2038 if (!n1.alias_set
2039 || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
2040 n->alias_set = n1.alias_set;
2041 else
2042 n->alias_set = ptr_type_node;
2043 n->vuse = n1.vuse;
2044 n->base_addr = n1.base_addr;
2045 n->offset = n1.offset;
2046 n->bytepos = n1.bytepos;
2047 n->type = n1.type;
2048 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2049 for (i = 0, mask = MARKER_MASK; i < size;
2050 i++, mask <<= BITS_PER_MARKER)
2051 {
2052 uint64_t masked1, masked2;
2053
2054 masked1 = n1.n & mask;
2055 masked2 = n2.n & mask;
2056 if (masked1 && masked2 && masked1 != masked2)
2057 return NULL;
2058 }
2059 n->n = n1.n | n2.n;
2060
2061 if (!verify_symbolic_number_p (n, stmt))
2062 return NULL;
2063
2064 break;
2065 default:
2066 return NULL;
2067 }
2068 return source_stmt1;
2069 }
2070 return NULL;
2071 }
2072
2073 /* Check if STMT completes a bswap implementation or a read in a given
2074 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2075 accordingly. It also sets N to represent the kind of operations
2076 performed: size of the resulting expression and whether it works on
2077 a memory source, and if so its alias-set and vuse. Finally, the
2078 function returns a stmt whose rhs's first tree is the source
2079 expression. */
2080
2081 static gimple
2082 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2083 {
2084 /* The number which the find_bswap_or_nop_1 result should match in order
2085 to have a full byte swap. The number is shifted to the right
2086 according to the size of the symbolic number before using it. */
2087 uint64_t cmpxchg = CMPXCHG;
2088 uint64_t cmpnop = CMPNOP;
2089
2090 gimple source_stmt;
2091 int limit;
2092
2093 /* The last parameter determines the search depth limit. It usually
2094 correlates directly to the number n of bytes to be touched. We
2095 increase that number by log2(n) + 1 here in order to also
2096 cover signed -> unsigned conversions of the src operand as can be seen
2097 in libgcc, and for an initial shift/and operation of the src operand. */
2098 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2099 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2100 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2101
2102 if (!source_stmt)
2103 return NULL;
2104
2105 /* Find real size of result (highest non zero byte). */
2106 if (n->base_addr)
2107 {
2108 int rsize;
2109 uint64_t tmpn;
2110
2111 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2112 n->range = rsize;
2113 }
2114
2115 /* Zero out the extra bits of N and CMP*. */
2116 if (n->range < (int) sizeof (int64_t))
2117 {
2118 uint64_t mask;
2119
2120 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2121 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2122 cmpnop &= mask;
2123 }
2124
2125 /* A complete byte swap should make the symbolic number start with the
2126 largest digit in the highest order byte. An unchanged symbolic number
2127 indicates a read with the same endianness as the target architecture. */
2128 if (n->n == cmpnop)
2129 *bswap = false;
2130 else if (n->n == cmpxchg)
2131 *bswap = true;
2132 else
2133 return NULL;
2134
2135 /* Useless bit manipulation performed by code. */
2136 if (!n->base_addr && n->n == cmpnop)
2137 return NULL;
2138
2139 n->range *= BITS_PER_UNIT;
2140 return source_stmt;
2141 }
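
/* For instance, a manual byte swap such as

     uint32_t
     swap32 (uint32_t x)
     {
       return ((x & 0x000000ff) << 24)
              | ((x & 0x0000ff00) << 8)
              | ((x & 0x00ff0000) >> 8)
              | ((x & 0xff000000) >> 24);
     }

   leaves the symbolic number equal to CMPXCHG (0x01020304 once reduced
   to 4 bytes), so *BSWAP is set to true, whereas the little-endian
   per-byte load sketched after find_bswap_or_nop_load matches CMPNOP on
   a little-endian target and CMPXCHG on a big-endian one.  */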
2142
2143 namespace {
2144
2145 const pass_data pass_data_optimize_bswap =
2146 {
2147 GIMPLE_PASS, /* type */
2148 "bswap", /* name */
2149 OPTGROUP_NONE, /* optinfo_flags */
2150 TV_NONE, /* tv_id */
2151 PROP_ssa, /* properties_required */
2152 0, /* properties_provided */
2153 0, /* properties_destroyed */
2154 0, /* todo_flags_start */
2155 0, /* todo_flags_finish */
2156 };
2157
2158 class pass_optimize_bswap : public gimple_opt_pass
2159 {
2160 public:
2161 pass_optimize_bswap (gcc::context *ctxt)
2162 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2163 {}
2164
2165 /* opt_pass methods: */
2166 virtual bool gate (function *)
2167 {
2168 return flag_expensive_optimizations && optimize;
2169 }
2170
2171 virtual unsigned int execute (function *);
2172
2173 }; // class pass_optimize_bswap
2174
2175 /* Perform the bswap optimization: replace the expression computed in the rhs
2176 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2177 Which of these alternatives replaces the rhs is given by N->base_addr (non
2178 null if a load is needed) and BSWAP. The type, VUSE and alias-set of the
2179 load to perform are also given in N while the bswap builtin to invoke is
2180 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2181 the load statements involved to construct the rhs in CUR_STMT and N->range
2182 gives the size of the rhs expression for maintaining some statistics.
2183
2184 Note that if the replacement involves a load, CUR_STMT is moved just after
2185 SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2186 changing basic block. */
2187
2188 static bool
2189 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2190 tree load_type, struct symbolic_number *n, bool bswap)
2191 {
2192 gimple_stmt_iterator gsi;
2193 tree src, tmp, tgt;
2194 gimple bswap_stmt;
2195
2196 gsi = gsi_for_stmt (cur_stmt);
2197 src = gimple_assign_rhs1 (src_stmt);
2198 tgt = gimple_assign_lhs (cur_stmt);
2199
2200 /* Need to load the value from memory first. */
2201 if (n->base_addr)
2202 {
2203 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2204 tree addr_expr, addr_tmp, val_expr, val_tmp;
2205 tree load_offset_ptr, aligned_load_type;
2206 gimple addr_stmt, load_stmt;
2207 unsigned align;
2208
2209 align = get_object_alignment (src);
2210 if (bswap
2211 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2212 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2213 return false;
2214
2215 /* Move cur_stmt just before one of the loads of the original sequence
2216 to ensure it has the same VUSE. See PR61517 for what could
2217 go wrong. */
2218 gsi_move_before (&gsi, &gsi_ins);
2219 gsi = gsi_for_stmt (cur_stmt);
2220
2221 /* Compute address to load from and cast according to the size
2222 of the load. */
2223 addr_expr = build_fold_addr_expr (unshare_expr (src));
2224 if (is_gimple_min_invariant (addr_expr))
2225 addr_tmp = addr_expr;
2226 else
2227 {
2228 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2229 "load_src");
2230 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2231 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2232 }
2233
2234 /* Perform the load. */
2235 aligned_load_type = load_type;
2236 if (align < TYPE_ALIGN (load_type))
2237 aligned_load_type = build_aligned_type (load_type, align);
2238 load_offset_ptr = build_int_cst (n->alias_set, 0);
2239 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2240 load_offset_ptr);
2241
2242 if (!bswap)
2243 {
2244 if (n->range == 16)
2245 nop_stats.found_16bit++;
2246 else if (n->range == 32)
2247 nop_stats.found_32bit++;
2248 else
2249 {
2250 gcc_assert (n->range == 64);
2251 nop_stats.found_64bit++;
2252 }
2253
2254 /* Convert the result of the load if necessary. */
2255 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2256 {
2257 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2258 "load_dst");
2259 load_stmt = gimple_build_assign (val_tmp, val_expr);
2260 gimple_set_vuse (load_stmt, n->vuse);
2261 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2262 gimple_assign_set_rhs_with_ops_1 (&gsi, NOP_EXPR, val_tmp,
2263 NULL_TREE, NULL_TREE);
2264 }
2265 else
2266 {
2267 gimple_assign_set_rhs_with_ops_1 (&gsi, MEM_REF, val_expr,
2268 NULL_TREE, NULL_TREE);
2269 gimple_set_vuse (cur_stmt, n->vuse);
2270 }
2271 update_stmt (cur_stmt);
2272
2273 if (dump_file)
2274 {
2275 fprintf (dump_file,
2276 "%d bit load in target endianness found at: ",
2277 (int)n->range);
2278 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2279 }
2280 return true;
2281 }
2282 else
2283 {
2284 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2285 load_stmt = gimple_build_assign (val_tmp, val_expr);
2286 gimple_set_vuse (load_stmt, n->vuse);
2287 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2288 }
2289 src = val_tmp;
2290 }
2291
2292 if (n->range == 16)
2293 bswap_stats.found_16bit++;
2294 else if (n->range == 32)
2295 bswap_stats.found_32bit++;
2296 else
2297 {
2298 gcc_assert (n->range == 64);
2299 bswap_stats.found_64bit++;
2300 }
2301
2302 tmp = src;
2303
2304 /* Canonical form for a 16-bit bswap is a rotate expression. Only 16-bit values
2305 are considered because a rotation of a 2N-bit value by N bits is generally not
2306 equivalent to a bswap. Consider for instance 0x01020304 rotated by 16 bits,
2307 which gives 0x03040102 while a bswap of that value is 0x04030201. */
2308 if (bswap && n->range == 16)
2309 {
2310 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2311 bswap_type = TREE_TYPE (src);
2312 src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2313 bswap_stmt = gimple_build_assign (NULL, src);
2314 }
2315 else
2316 {
2317 /* Convert the src expression if necessary. */
2318 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2319 {
2320 gimple convert_stmt;
2321 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2322 convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src,
2323 NULL);
2324 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2325 }
2326
2327 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2328 }
2329
2330 tmp = tgt;
2331
2332 /* Convert the result if necessary. */
2333 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2334 {
2335 gimple convert_stmt;
2336 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2337 convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
2338 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2339 }
2340
2341 gimple_set_lhs (bswap_stmt, tmp);
2342
2343 if (dump_file)
2344 {
2345 fprintf (dump_file, "%d bit bswap implementation found at: ",
2346 (int)n->range);
2347 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2348 }
2349
2350 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2351 gsi_remove (&gsi, true);
2352 return true;
2353 }
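
/* Roughly, for a byte-swapped 32-bit memory read the statements emitted
   above amount to (temporary names are illustrative):

     load_dst = *(uint32_t *) addr;
     tgt = __builtin_bswap32 (load_dst);

   while a read already in target endianness keeps only the wide load,
   and a 16-bit byte swap is instead rewritten as an 8-bit rotate of the
   source value.  */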
2354
2355 /* Find manual byte swap implementations as well as loads in a given
2356 endianness. Byte swaps are turned into a bswap builtin invocation
2357 while endian loads are converted to a bswap builtin invocation or a
2358 simple load according to the target endianness. */
2359
2360 unsigned int
2361 pass_optimize_bswap::execute (function *fun)
2362 {
2363 basic_block bb;
2364 bool bswap16_p, bswap32_p, bswap64_p;
2365 bool changed = false;
2366 tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2367
2368 if (BITS_PER_UNIT != 8)
2369 return 0;
2370
2371 bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
2372 && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
2373 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2374 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2375 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2376 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2377 || (bswap32_p && word_mode == SImode)));
2378
2379 /* Determine the argument type of the builtins. The code later on
2380 assumes that the return and argument type are the same. */
2381 if (bswap16_p)
2382 {
2383 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2384 bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2385 }
2386
2387 if (bswap32_p)
2388 {
2389 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2390 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2391 }
2392
2393 if (bswap64_p)
2394 {
2395 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2396 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2397 }
2398
2399 memset (&nop_stats, 0, sizeof (nop_stats));
2400 memset (&bswap_stats, 0, sizeof (bswap_stats));
2401
2402 FOR_EACH_BB_FN (bb, fun)
2403 {
2404 gimple_stmt_iterator gsi;
2405
2406 /* We do a reverse scan for bswap patterns to make sure we get the
2407 widest match. As bswap pattern matching doesn't handle previously
2408 inserted smaller bswap replacements as sub-patterns, the wider
2409 variant wouldn't be detected. */
2410 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2411 {
2412 gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2413 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2414 enum tree_code code;
2415 struct symbolic_number n;
2416 bool bswap;
2417
2418 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2419 might be moved to a different basic block by bswap_replace and gsi
2420 must not point to it if that's the case. Doing the gsi_prev
2421 here makes sure that gsi points to the statement previous to
2422 cur_stmt while still making sure that all statements are
2423 considered in this basic block. */
2424 gsi_prev (&gsi);
2425
2426 if (!is_gimple_assign (cur_stmt))
2427 continue;
2428
2429 code = gimple_assign_rhs_code (cur_stmt);
2430 switch (code)
2431 {
2432 case LROTATE_EXPR:
2433 case RROTATE_EXPR:
2434 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2435 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2436 % BITS_PER_UNIT)
2437 continue;
2438 /* Fall through. */
2439 case BIT_IOR_EXPR:
2440 break;
2441 default:
2442 continue;
2443 }
2444
2445 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2446
2447 if (!src_stmt)
2448 continue;
2449
2450 switch (n.range)
2451 {
2452 case 16:
2453 /* Already in canonical form, nothing to do. */
2454 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2455 continue;
2456 load_type = uint16_type_node;
2457 if (bswap16_p)
2458 {
2459 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2460 bswap_type = bswap16_type;
2461 }
2462 break;
2463 case 32:
2464 load_type = uint32_type_node;
2465 if (bswap32_p)
2466 {
2467 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2468 bswap_type = bswap32_type;
2469 }
2470 break;
2471 case 64:
2472 load_type = uint64_type_node;
2473 if (bswap64_p)
2474 {
2475 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2476 bswap_type = bswap64_type;
2477 }
2478 break;
2479 default:
2480 continue;
2481 }
2482
2483 if (bswap && !fndecl)
2484 continue;
2485
2486 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2487 &n, bswap))
2488 changed = true;
2489 }
2490 }
2491
2492 statistics_counter_event (fun, "16-bit nop implementations found",
2493 nop_stats.found_16bit);
2494 statistics_counter_event (fun, "32-bit nop implementations found",
2495 nop_stats.found_32bit);
2496 statistics_counter_event (fun, "64-bit nop implementations found",
2497 nop_stats.found_64bit);
2498 statistics_counter_event (fun, "16-bit bswap implementations found",
2499 bswap_stats.found_16bit);
2500 statistics_counter_event (fun, "32-bit bswap implementations found",
2501 bswap_stats.found_32bit);
2502 statistics_counter_event (fun, "64-bit bswap implementations found",
2503 bswap_stats.found_64bit);
2504
2505 return (changed ? TODO_update_ssa : 0);
2506 }
2507
2508 } // anon namespace
2509
2510 gimple_opt_pass *
2511 make_pass_optimize_bswap (gcc::context *ctxt)
2512 {
2513 return new pass_optimize_bswap (ctxt);
2514 }
2515
2516 /* Return true if stmt is a type conversion operation that can be stripped
2517 when used in a widening multiply operation. */
2518 static bool
2519 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2520 {
2521 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2522
2523 if (TREE_CODE (result_type) == INTEGER_TYPE)
2524 {
2525 tree op_type;
2526 tree inner_op_type;
2527
2528 if (!CONVERT_EXPR_CODE_P (rhs_code))
2529 return false;
2530
2531 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2532
2533 /* If the type of OP has the same precision as the result, then
2534 we can strip this conversion. The multiply operation will be
2535 selected to create the correct extension as a by-product. */
2536 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2537 return true;
2538
2539 /* We can also strip a conversion if it preserves the signed-ness of
2540 the operation and doesn't narrow the range. */
2541 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2542
2543 /* If the inner-most type is unsigned, then we can strip any
2544 intermediate widening operation. If it's signed, then the
2545 intermediate widening operation must also be signed. */
2546 if ((TYPE_UNSIGNED (inner_op_type)
2547 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2548 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2549 return true;
2550
2551 return false;
2552 }
2553
2554 return rhs_code == FIXED_CONVERT_EXPR;
2555 }
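
/* A minimal example of a strippable conversion, with LP64 int/long long:

     int a, b;
     long long r = (long long) a * (long long) b;

   Each cast produces a value with the same precision as the
   multiplication result, so the cast can be looked through and the
   multiplication treated as a widening multiply of the narrower int
   operands.  */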
2556
2557 /* Return true if RHS is a suitable operand for a widening multiplication,
2558 assuming a target type of TYPE.
2559 There are two cases:
2560
2561 - RHS makes some value at least twice as wide. Store that value
2562 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2563
2564 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2565 but leave *TYPE_OUT untouched. */
2566
2567 static bool
2568 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2569 tree *new_rhs_out)
2570 {
2571 gimple stmt;
2572 tree type1, rhs1;
2573
2574 if (TREE_CODE (rhs) == SSA_NAME)
2575 {
2576 stmt = SSA_NAME_DEF_STMT (rhs);
2577 if (is_gimple_assign (stmt))
2578 {
2579 if (! widening_mult_conversion_strippable_p (type, stmt))
2580 rhs1 = rhs;
2581 else
2582 {
2583 rhs1 = gimple_assign_rhs1 (stmt);
2584
2585 if (TREE_CODE (rhs1) == INTEGER_CST)
2586 {
2587 *new_rhs_out = rhs1;
2588 *type_out = NULL;
2589 return true;
2590 }
2591 }
2592 }
2593 else
2594 rhs1 = rhs;
2595
2596 type1 = TREE_TYPE (rhs1);
2597
2598 if (TREE_CODE (type1) != TREE_CODE (type)
2599 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2600 return false;
2601
2602 *new_rhs_out = rhs1;
2603 *type_out = type1;
2604 return true;
2605 }
2606
2607 if (TREE_CODE (rhs) == INTEGER_CST)
2608 {
2609 *new_rhs_out = rhs;
2610 *type_out = NULL;
2611 return true;
2612 }
2613
2614 return false;
2615 }
2616
2617 /* Return true if STMT performs a widening multiplication, assuming the
2618 output type is TYPE. If so, store the unwidened types of the operands
2619 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2620 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2621 and *TYPE2_OUT would give the operands of the multiplication. */
2622
2623 static bool
2624 is_widening_mult_p (gimple stmt,
2625 tree *type1_out, tree *rhs1_out,
2626 tree *type2_out, tree *rhs2_out)
2627 {
2628 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2629
2630 if (TREE_CODE (type) != INTEGER_TYPE
2631 && TREE_CODE (type) != FIXED_POINT_TYPE)
2632 return false;
2633
2634 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2635 rhs1_out))
2636 return false;
2637
2638 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2639 rhs2_out))
2640 return false;
2641
2642 if (*type1_out == NULL)
2643 {
2644 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2645 return false;
2646 *type1_out = *type2_out;
2647 }
2648
2649 if (*type2_out == NULL)
2650 {
2651 if (!int_fits_type_p (*rhs2_out, *type1_out))
2652 return false;
2653 *type2_out = *type1_out;
2654 }
2655
2656 /* Ensure that the larger of the two operands comes first. */
2657 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2658 {
2659 tree tmp;
2660 tmp = *type1_out;
2661 *type1_out = *type2_out;
2662 *type2_out = tmp;
2663 tmp = *rhs1_out;
2664 *rhs1_out = *rhs2_out;
2665 *rhs2_out = tmp;
2666 }
2667
2668 return true;
2669 }
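
/* For instance, with LP64 types:

     int a;
     long long r = (long long) a * 1000;

   the INTEGER_CST operand leaves *TYPE2_OUT NULL, and the code above
   accepts the statement as a widening multiplication only after checking
   that 1000 also fits in the other operand's narrow type (int here).  */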
2670
2671 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2672 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2673 value is true iff we converted the statement. */
2674
2675 static bool
2676 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2677 {
2678 tree lhs, rhs1, rhs2, type, type1, type2;
2679 enum insn_code handler;
2680 machine_mode to_mode, from_mode, actual_mode;
2681 optab op;
2682 int actual_precision;
2683 location_t loc = gimple_location (stmt);
2684 bool from_unsigned1, from_unsigned2;
2685
2686 lhs = gimple_assign_lhs (stmt);
2687 type = TREE_TYPE (lhs);
2688 if (TREE_CODE (type) != INTEGER_TYPE)
2689 return false;
2690
2691 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2692 return false;
2693
2694 to_mode = TYPE_MODE (type);
2695 from_mode = TYPE_MODE (type1);
2696 from_unsigned1 = TYPE_UNSIGNED (type1);
2697 from_unsigned2 = TYPE_UNSIGNED (type2);
2698
2699 if (from_unsigned1 && from_unsigned2)
2700 op = umul_widen_optab;
2701 else if (!from_unsigned1 && !from_unsigned2)
2702 op = smul_widen_optab;
2703 else
2704 op = usmul_widen_optab;
2705
2706 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2707 0, &actual_mode);
2708
2709 if (handler == CODE_FOR_nothing)
2710 {
2711 if (op != smul_widen_optab)
2712 {
2713 /* We can use a signed multiply with unsigned types as long as
2714 there is a wider mode to use, or it is the smaller of the two
2715 types that is unsigned. Note that type1 >= type2, always. */
2716 if ((TYPE_UNSIGNED (type1)
2717 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2718 || (TYPE_UNSIGNED (type2)
2719 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2720 {
2721 from_mode = GET_MODE_WIDER_MODE (from_mode);
2722 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2723 return false;
2724 }
2725
2726 op = smul_widen_optab;
2727 handler = find_widening_optab_handler_and_mode (op, to_mode,
2728 from_mode, 0,
2729 &actual_mode);
2730
2731 if (handler == CODE_FOR_nothing)
2732 return false;
2733
2734 from_unsigned1 = from_unsigned2 = false;
2735 }
2736 else
2737 return false;
2738 }
2739
2740 /* Ensure that the inputs to the handler are in the correct precision
2741 for the opcode. This will be the full mode size. */
2742 actual_precision = GET_MODE_PRECISION (actual_mode);
2743 if (2 * actual_precision > TYPE_PRECISION (type))
2744 return false;
2745 if (actual_precision != TYPE_PRECISION (type1)
2746 || from_unsigned1 != TYPE_UNSIGNED (type1))
2747 rhs1 = build_and_insert_cast (gsi, loc,
2748 build_nonstandard_integer_type
2749 (actual_precision, from_unsigned1), rhs1);
2750 if (actual_precision != TYPE_PRECISION (type2)
2751 || from_unsigned2 != TYPE_UNSIGNED (type2))
2752 rhs2 = build_and_insert_cast (gsi, loc,
2753 build_nonstandard_integer_type
2754 (actual_precision, from_unsigned2), rhs2);
2755
2756 /* Handle constants. */
2757 if (TREE_CODE (rhs1) == INTEGER_CST)
2758 rhs1 = fold_convert (type1, rhs1);
2759 if (TREE_CODE (rhs2) == INTEGER_CST)
2760 rhs2 = fold_convert (type2, rhs2);
2761
2762 gimple_assign_set_rhs1 (stmt, rhs1);
2763 gimple_assign_set_rhs2 (stmt, rhs2);
2764 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2765 update_stmt (stmt);
2766 widen_mul_stats.widen_mults_inserted++;
2767 return true;
2768 }
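
/* The typical source shape this rewrites, assuming the target provides
   an unsigned 32x32->64 widening multiply pattern:

     unsigned int a, b;
     unsigned long long r = (unsigned long long) a * b;

   The MULT_EXPR on the extended operands becomes a WIDEN_MULT_EXPR
   (printed as "w*" in the gimple dumps) using umul_widen_optab, so the
   expander can emit a single widening multiply instruction.  */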
2769
2770 /* Process a single gimple statement STMT, which is found at the
2771 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2772 rhs (given by CODE), and try to convert it into a
2773 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2774 is true iff we converted the statement. */
2775
2776 static bool
2777 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2778 enum tree_code code)
2779 {
2780 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2781 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2782 tree type, type1, type2, optype;
2783 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2784 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2785 optab this_optab;
2786 enum tree_code wmult_code;
2787 enum insn_code handler;
2788 machine_mode to_mode, from_mode, actual_mode;
2789 location_t loc = gimple_location (stmt);
2790 int actual_precision;
2791 bool from_unsigned1, from_unsigned2;
2792
2793 lhs = gimple_assign_lhs (stmt);
2794 type = TREE_TYPE (lhs);
2795 if (TREE_CODE (type) != INTEGER_TYPE
2796 && TREE_CODE (type) != FIXED_POINT_TYPE)
2797 return false;
2798
2799 if (code == MINUS_EXPR)
2800 wmult_code = WIDEN_MULT_MINUS_EXPR;
2801 else
2802 wmult_code = WIDEN_MULT_PLUS_EXPR;
2803
2804 rhs1 = gimple_assign_rhs1 (stmt);
2805 rhs2 = gimple_assign_rhs2 (stmt);
2806
2807 if (TREE_CODE (rhs1) == SSA_NAME)
2808 {
2809 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2810 if (is_gimple_assign (rhs1_stmt))
2811 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2812 }
2813
2814 if (TREE_CODE (rhs2) == SSA_NAME)
2815 {
2816 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2817 if (is_gimple_assign (rhs2_stmt))
2818 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2819 }
2820
2821 /* Allow for one conversion statement between the multiply
2822 and addition/subtraction statement. If there are more than
2823 one conversion then we assume they would invalidate this
2824 transformation. If that's not the case then they should have
2825 been folded before now. */
2826 if (CONVERT_EXPR_CODE_P (rhs1_code))
2827 {
2828 conv1_stmt = rhs1_stmt;
2829 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2830 if (TREE_CODE (rhs1) == SSA_NAME)
2831 {
2832 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2833 if (is_gimple_assign (rhs1_stmt))
2834 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2835 }
2836 else
2837 return false;
2838 }
2839 if (CONVERT_EXPR_CODE_P (rhs2_code))
2840 {
2841 conv2_stmt = rhs2_stmt;
2842 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2843 if (TREE_CODE (rhs2) == SSA_NAME)
2844 {
2845 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2846 if (is_gimple_assign (rhs2_stmt))
2847 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2848 }
2849 else
2850 return false;
2851 }
2852
2853 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2854 is_widening_mult_p, but we still need the rhs values it returns.
2855
2856 It might also appear that it would be sufficient to use the existing
2857 operands of the widening multiply, but that would limit the choice of
2858 multiply-and-accumulate instructions.
2859
2860 If the widened-multiplication result has more than one use, it is
2861 probably wiser not to do the conversion. */
2862 if (code == PLUS_EXPR
2863 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2864 {
2865 if (!has_single_use (rhs1)
2866 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2867 &type2, &mult_rhs2))
2868 return false;
2869 add_rhs = rhs2;
2870 conv_stmt = conv1_stmt;
2871 }
2872 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2873 {
2874 if (!has_single_use (rhs2)
2875 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2876 &type2, &mult_rhs2))
2877 return false;
2878 add_rhs = rhs1;
2879 conv_stmt = conv2_stmt;
2880 }
2881 else
2882 return false;
2883
2884 to_mode = TYPE_MODE (type);
2885 from_mode = TYPE_MODE (type1);
2886 from_unsigned1 = TYPE_UNSIGNED (type1);
2887 from_unsigned2 = TYPE_UNSIGNED (type2);
2888 optype = type1;
2889
2890 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2891 if (from_unsigned1 != from_unsigned2)
2892 {
2893 if (!INTEGRAL_TYPE_P (type))
2894 return false;
2895 /* We can use a signed multiply with unsigned types as long as
2896 there is a wider mode to use, or it is the smaller of the two
2897 types that is unsigned. Note that type1 >= type2, always. */
2898 if ((from_unsigned1
2899 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2900 || (from_unsigned2
2901 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2902 {
2903 from_mode = GET_MODE_WIDER_MODE (from_mode);
2904 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2905 return false;
2906 }
2907
2908 from_unsigned1 = from_unsigned2 = false;
2909 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2910 false);
2911 }
2912
2913 /* If there was a conversion between the multiply and addition
2914 then we need to make sure it fits a multiply-and-accumulate.
2915 There should be a single mode change which does not change the
2916 value. */
2917 if (conv_stmt)
2918 {
2919 /* We use the original, unmodified data types for this. */
2920 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2921 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2922 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2923 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2924
2925 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2926 {
2927 /* Conversion is a truncate. */
2928 if (TYPE_PRECISION (to_type) < data_size)
2929 return false;
2930 }
2931 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2932 {
2933 /* Conversion is an extend. Check it's the right sort. */
2934 if (TYPE_UNSIGNED (from_type) != is_unsigned
2935 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2936 return false;
2937 }
2938 /* else convert is a no-op for our purposes. */
2939 }
2940
2941 /* Verify that the machine can perform a widening multiply
2942 accumulate in this mode/signedness combination, otherwise
2943 this transformation is likely to pessimize code. */
2944 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2945 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2946 from_mode, 0, &actual_mode);
2947
2948 if (handler == CODE_FOR_nothing)
2949 return false;
2950
2951 /* Ensure that the inputs to the handler are in the correct precision
2952 for the opcode. This will be the full mode size. */
2953 actual_precision = GET_MODE_PRECISION (actual_mode);
2954 if (actual_precision != TYPE_PRECISION (type1)
2955 || from_unsigned1 != TYPE_UNSIGNED (type1))
2956 mult_rhs1 = build_and_insert_cast (gsi, loc,
2957 build_nonstandard_integer_type
2958 (actual_precision, from_unsigned1),
2959 mult_rhs1);
2960 if (actual_precision != TYPE_PRECISION (type2)
2961 || from_unsigned2 != TYPE_UNSIGNED (type2))
2962 mult_rhs2 = build_and_insert_cast (gsi, loc,
2963 build_nonstandard_integer_type
2964 (actual_precision, from_unsigned2),
2965 mult_rhs2);
2966
2967 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2968 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2969
2970 /* Handle constants. */
2971 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2972 mult_rhs1 = fold_convert (type1, mult_rhs1);
2973 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2974 mult_rhs2 = fold_convert (type2, mult_rhs2);
2975
2976 gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2977 add_rhs);
2978 update_stmt (gsi_stmt (*gsi));
2979 widen_mul_stats.maccs_inserted++;
2980 return true;
2981 }
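
/* A sketch of the accumulate form this targets, assuming the target has
   a widening multiply-and-accumulate pattern for the int -> long long
   mode pair:

     int a, b;
     long long acc;
     acc = acc + (long long) a * (long long) b;

   The widening multiply and the addition are fused into a single
   WIDEN_MULT_PLUS_EXPR; the MINUS_EXPR variant becomes
   WIDEN_MULT_MINUS_EXPR in the same way.  */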
2982
2983 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2984 with uses in additions and subtractions to form fused multiply-add
2985 operations. Returns true if successful and MUL_STMT should be removed. */
2986
2987 static bool
2988 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2989 {
2990 tree mul_result = gimple_get_lhs (mul_stmt);
2991 tree type = TREE_TYPE (mul_result);
2992 gimple use_stmt, neguse_stmt, fma_stmt;
2993 use_operand_p use_p;
2994 imm_use_iterator imm_iter;
2995
2996 if (FLOAT_TYPE_P (type)
2997 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2998 return false;
2999
3000 /* We don't want to do bitfield reduction ops. */
3001 if (INTEGRAL_TYPE_P (type)
3002 && (TYPE_PRECISION (type)
3003 != GET_MODE_PRECISION (TYPE_MODE (type))))
3004 return false;
3005
3006 /* If the target doesn't support it, don't generate it. We assume that
3007 if fma isn't available then fms, fnma or fnms are not either. */
3008 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3009 return false;
3010
3011 /* If the multiplication has zero uses, it is kept around probably because
3012 of -fnon-call-exceptions. Don't optimize it away in that case,
3013 it is DCE's job. */
3014 if (has_zero_uses (mul_result))
3015 return false;
3016
3017 /* Make sure that the multiplication statement becomes dead after
3018 the transformation, thus that all uses are transformed to FMAs.
3019 This means we assume that an FMA operation has the same cost
3020 as an addition. */
3021 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3022 {
3023 enum tree_code use_code;
3024 tree result = mul_result;
3025 bool negate_p = false;
3026
3027 use_stmt = USE_STMT (use_p);
3028
3029 if (is_gimple_debug (use_stmt))
3030 continue;
3031
3032 /* For now restrict this operation to single basic blocks. In theory
3033 we would want to support sinking the multiplication in
3034 m = a*b;
3035 if ()
3036 ma = m + c;
3037 else
3038 d = m;
3039 to form a fma in the then block and sink the multiplication to the
3040 else block. */
3041 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3042 return false;
3043
3044 if (!is_gimple_assign (use_stmt))
3045 return false;
3046
3047 use_code = gimple_assign_rhs_code (use_stmt);
3048
3049 /* A negate on the multiplication leads to FNMA. */
3050 if (use_code == NEGATE_EXPR)
3051 {
3052 ssa_op_iter iter;
3053 use_operand_p usep;
3054
3055 result = gimple_assign_lhs (use_stmt);
3056
3057 /* Make sure the negate statement becomes dead with this
3058 single transformation. */
3059 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3060 &use_p, &neguse_stmt))
3061 return false;
3062
3063 /* Make sure the multiplication isn't also used on that stmt. */
3064 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3065 if (USE_FROM_PTR (usep) == mul_result)
3066 return false;
3067
3068 /* Re-validate. */
3069 use_stmt = neguse_stmt;
3070 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3071 return false;
3072 if (!is_gimple_assign (use_stmt))
3073 return false;
3074
3075 use_code = gimple_assign_rhs_code (use_stmt);
3076 negate_p = true;
3077 }
3078
3079 switch (use_code)
3080 {
3081 case MINUS_EXPR:
3082 if (gimple_assign_rhs2 (use_stmt) == result)
3083 negate_p = !negate_p;
3084 break;
3085 case PLUS_EXPR:
3086 break;
3087 default:
3088 /* FMA can only be formed from PLUS and MINUS. */
3089 return false;
3090 }
3091
3092 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3093 by a MULT_EXPR that we'll visit later, we might be able to
3094 get a more profitable match with fnma.
3095 OTOH, if we don't, a negate / fma pair has likely lower latency
3096 than a mult / subtract pair. */
3097 if (use_code == MINUS_EXPR && !negate_p
3098 && gimple_assign_rhs1 (use_stmt) == result
3099 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3100 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3101 {
3102 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3103
3104 if (TREE_CODE (rhs2) == SSA_NAME)
3105 {
3106 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3107 if (has_single_use (rhs2)
3108 && is_gimple_assign (stmt2)
3109 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3110 return false;
3111 }
3112 }
3113
3114 /* We can't handle a * b + a * b. */
3115 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3116 return false;
3117
3118 /* While it is possible to validate whether or not the exact form
3119 that we've recognized is available in the backend, the assumption
3120 is that the transformation is never a loss. For instance, suppose
3121 the target only has the plain FMA pattern available. Consider
3122 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3123 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3124 still have 3 operations, but in the FMA form the two NEGs are
3125 independent and could be run in parallel. */
3126 }
3127
3128 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3129 {
3130 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3131 enum tree_code use_code;
3132 tree addop, mulop1 = op1, result = mul_result;
3133 bool negate_p = false;
3134
3135 if (is_gimple_debug (use_stmt))
3136 continue;
3137
3138 use_code = gimple_assign_rhs_code (use_stmt);
3139 if (use_code == NEGATE_EXPR)
3140 {
3141 result = gimple_assign_lhs (use_stmt);
3142 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3143 gsi_remove (&gsi, true);
3144 release_defs (use_stmt);
3145
3146 use_stmt = neguse_stmt;
3147 gsi = gsi_for_stmt (use_stmt);
3148 use_code = gimple_assign_rhs_code (use_stmt);
3149 negate_p = true;
3150 }
3151
3152 if (gimple_assign_rhs1 (use_stmt) == result)
3153 {
3154 addop = gimple_assign_rhs2 (use_stmt);
3155 /* a * b - c -> a * b + (-c) */
3156 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3157 addop = force_gimple_operand_gsi (&gsi,
3158 build1 (NEGATE_EXPR,
3159 type, addop),
3160 true, NULL_TREE, true,
3161 GSI_SAME_STMT);
3162 }
3163 else
3164 {
3165 addop = gimple_assign_rhs1 (use_stmt);
3166 /* a - b * c -> (-b) * c + a */
3167 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3168 negate_p = !negate_p;
3169 }
3170
3171 if (negate_p)
3172 mulop1 = force_gimple_operand_gsi (&gsi,
3173 build1 (NEGATE_EXPR,
3174 type, mulop1),
3175 true, NULL_TREE, true,
3176 GSI_SAME_STMT);
3177
3178 fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
3179 gimple_assign_lhs (use_stmt),
3180 mulop1, op2,
3181 addop);
3182 gsi_replace (&gsi, fma_stmt, true);
3183 widen_mul_stats.fmas_inserted++;
3184 }
3185
3186 return true;
3187 }
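
/* Typical shapes that get fused here, provided the fma optab exists for
   the mode and -ffp-contract allows contraction:

     double a, b, c;
     double r1 = a * b + c;      becomes fma (a, b, c)
     double r2 = a * b - c;      becomes fma (a, b, -c)
     double r3 = c - a * b;      becomes fma (-a, b, c)

   The multiplication itself must end up dead, i.e. every use of the
   product has to be absorbed into such an FMA_EXPR.  */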
3188
3189 /* Find integer multiplications where the operands are extended from
3190 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3191 where appropriate. */
3192
3193 namespace {
3194
3195 const pass_data pass_data_optimize_widening_mul =
3196 {
3197 GIMPLE_PASS, /* type */
3198 "widening_mul", /* name */
3199 OPTGROUP_NONE, /* optinfo_flags */
3200 TV_NONE, /* tv_id */
3201 PROP_ssa, /* properties_required */
3202 0, /* properties_provided */
3203 0, /* properties_destroyed */
3204 0, /* todo_flags_start */
3205 TODO_update_ssa, /* todo_flags_finish */
3206 };
3207
3208 class pass_optimize_widening_mul : public gimple_opt_pass
3209 {
3210 public:
3211 pass_optimize_widening_mul (gcc::context *ctxt)
3212 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3213 {}
3214
3215 /* opt_pass methods: */
3216 virtual bool gate (function *)
3217 {
3218 return flag_expensive_optimizations && optimize;
3219 }
3220
3221 virtual unsigned int execute (function *);
3222
3223 }; // class pass_optimize_widening_mul
3224
3225 unsigned int
3226 pass_optimize_widening_mul::execute (function *fun)
3227 {
3228 basic_block bb;
3229 bool cfg_changed = false;
3230
3231 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3232
3233 FOR_EACH_BB_FN (bb, fun)
3234 {
3235 gimple_stmt_iterator gsi;
3236
3237 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3238 {
3239 gimple stmt = gsi_stmt (gsi);
3240 enum tree_code code;
3241
3242 if (is_gimple_assign (stmt))
3243 {
3244 code = gimple_assign_rhs_code (stmt);
3245 switch (code)
3246 {
3247 case MULT_EXPR:
3248 if (!convert_mult_to_widen (stmt, &gsi)
3249 && convert_mult_to_fma (stmt,
3250 gimple_assign_rhs1 (stmt),
3251 gimple_assign_rhs2 (stmt)))
3252 {
3253 gsi_remove (&gsi, true);
3254 release_defs (stmt);
3255 continue;
3256 }
3257 break;
3258
3259 case PLUS_EXPR:
3260 case MINUS_EXPR:
3261 convert_plusminus_to_widen (&gsi, stmt, code);
3262 break;
3263
3264 default:;
3265 }
3266 }
3267 else if (is_gimple_call (stmt)
3268 && gimple_call_lhs (stmt))
3269 {
3270 tree fndecl = gimple_call_fndecl (stmt);
3271 if (fndecl
3272 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3273 {
3274 switch (DECL_FUNCTION_CODE (fndecl))
3275 {
3276 case BUILT_IN_POWF:
3277 case BUILT_IN_POW:
3278 case BUILT_IN_POWL:
3279 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3280 && REAL_VALUES_EQUAL
3281 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3282 dconst2)
3283 && convert_mult_to_fma (stmt,
3284 gimple_call_arg (stmt, 0),
3285 gimple_call_arg (stmt, 0)))
3286 {
3287 unlink_stmt_vdef (stmt);
3288 if (gsi_remove (&gsi, true)
3289 && gimple_purge_dead_eh_edges (bb))
3290 cfg_changed = true;
3291 release_defs (stmt);
3292 continue;
3293 }
3294 break;
3295
3296 default:;
3297 }
3298 }
3299 }
3300 gsi_next (&gsi);
3301 }
3302 }
3303
3304 statistics_counter_event (fun, "widening multiplications inserted",
3305 widen_mul_stats.widen_mults_inserted);
3306 statistics_counter_event (fun, "widening maccs inserted",
3307 widen_mul_stats.maccs_inserted);
3308 statistics_counter_event (fun, "fused multiply-adds inserted",
3309 widen_mul_stats.fmas_inserted);
3310
3311 return cfg_changed ? TODO_cleanup_cfg : 0;
3312 }
3313
3314 } // anon namespace
3315
3316 gimple_opt_pass *
3317 make_pass_optimize_widening_mul (gcc::context *ctxt)
3318 {
3319 return new pass_optimize_widening_mul (ctxt);
3320 }