gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "hash-set.h"
  93 #include "machmode.h"
  94 #include "vec.h"
  95 #include "double-int.h"
  96 #include "input.h"
  97 #include "alias.h"
  98 #include "symtab.h"
  99 #include "wide-int.h"
 100 #include "inchash.h"
 101 #include "tree.h"
 102 #include "fold-const.h"
 103 #include "predict.h"
 104 #include "hard-reg-set.h"
 105 #include "input.h"
 106 #include "function.h"
 107 #include "dominance.h"
 108 #include "cfg.h"
 109 #include "basic-block.h"
 110 #include "tree-ssa-alias.h"
 111 #include "internal-fn.h"
 112 #include "gimple-fold.h"
 113 #include "gimple-expr.h"
 114 #include "is-a.h"
 115 #include "gimple.h"
 116 #include "gimple-iterator.h"
 117 #include "gimplify.h"
 118 #include "gimplify-me.h"
 119 #include "stor-layout.h"
 120 #include "gimple-ssa.h"
 121 #include "tree-cfg.h"
 122 #include "tree-phinodes.h"
 123 #include "ssa-iterators.h"
 124 #include "stringpool.h"
 125 #include "tree-ssanames.h"
 126 #include "expr.h"
 127 #include "tree-dfa.h"
 128 #include "tree-ssa.h"
 129 #include "tree-pass.h"
 130 #include "alloc-pool.h"
 131 #include "target.h"
 132 #include "gimple-pretty-print.h"
 133 #include "builtins.h"
 134
 135 /* FIXME: RTL headers have to be included here for optabs.  */
 136 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 137 #include "expr.h"               /* Because optabs.h wants sepops.  */
 138 #include "insn-codes.h"
 139 #include "optabs.h"
 140
 141 /* This structure represents one basic block that either computes a
 142    division, or is a common dominator for basic block that compute a
 143    division.  */
 144 struct occurrence {
 145   /* The basic block represented by this structure.  */
 146   basic_block bb;
 147
 148   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 149      inserted in BB.  */
 150   tree recip_def;
 151
 152   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 153      was inserted in BB.  */
 154   gimple recip_def_stmt;
 155
 156   /* Pointer to a list of "struct occurrence"s for blocks dominated
 157      by BB.  */
 158   struct occurrence *children;
 159
 160   /* Pointer to the next "struct occurrence"s in the list of blocks
 161      sharing a common dominator.  */
 162   struct occurrence *next;
 163
 164   /* The number of divisions that are in BB before compute_merit.  The
 165      number of divisions that are in BB or post-dominate it after
 166      compute_merit.  */
 167   int num_divisions;
 168
 169   /* True if the basic block has a division, false if it is a common
 170      dominator for basic blocks that do.  If it is false and trapping
 171      math is active, BB is not a candidate for inserting a reciprocal.  */
 172   bool bb_has_division;
 173 };
 174
 175 static struct
 176 {
 177   /* Number of 1.0/X ops inserted.  */
 178   int rdivs_inserted;
 179
 180   /* Number of 1.0/FUNC ops inserted.  */
 181   int rfuncs_inserted;
 182 } reciprocal_stats;
 183
 184 static struct
 185 {
 186   /* Number of cexpi calls inserted.  */
 187   int inserted;
 188 } sincos_stats;
 189
 190 static struct
 191 {
 192   /* Number of hand-written 16-bit nop / bswaps found.  */
 193   int found_16bit;
 194
 195   /* Number of hand-written 32-bit nop / bswaps found.  */
 196   int found_32bit;
 197
 198   /* Number of hand-written 64-bit nop / bswaps found.  */
 199   int found_64bit;
 200 } nop_stats, bswap_stats;
 201
 202 static struct
 203 {
 204   /* Number of widening multiplication ops inserted.  */
 205   int widen_mults_inserted;
 206
 207   /* Number of integer multiply-and-accumulate ops inserted.  */
 208   int maccs_inserted;
 209
 210   /* Number of fp fused multiply-add ops inserted.  */
 211   int fmas_inserted;
 212 } widen_mul_stats;
 213
 214 /* The instance of "struct occurrence" representing the highest
 215    interesting block in the dominator tree.  */
 216 static struct occurrence *occ_head;
 217
 218 /* Allocation pool for getting instances of "struct occurrence".  */
 219 static alloc_pool occ_pool;
 220
 221
 222
 223 /* Allocate and return a new struct occurrence for basic block BB, and
 224    whose children list is headed by CHILDREN.  */
 225 static struct occurrence *
 226 occ_new (basic_block bb, struct occurrence *children)
 227 {
 228   struct occurrence *occ;
 229
 230   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 231   memset (occ, 0, sizeof (struct occurrence));
 232
 233   occ->bb = bb;
 234   occ->children = children;
 235   return occ;
 236 }
 237
 238
 239 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 240    list of "struct occurrence"s, one per basic block, having IDOM as
 241    their common dominator.
 242
 243    We try to insert NEW_OCC as deep as possible in the tree, and we also
 244    insert any other block that is a common dominator for BB and one
 245    block already in the tree.  */
 246
 247 static void
 248 insert_bb (struct occurrence *new_occ, basic_block idom,
 249            struct occurrence **p_head)
 250 {
 251   struct occurrence *occ, **p_occ;
 252
 253   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 254     {
 255       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 256       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 257       if (dom == bb)
 258         {
 259           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 260              from its list.  */
 261           *p_occ = occ->next;
 262           occ->next = new_occ->children;
 263           new_occ->children = occ;
 264
 265           /* Try the next block (it may as well be dominated by BB).  */
 266         }
 267
 268       else if (dom == occ_bb)
 269         {
 270           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 271           insert_bb (new_occ, dom, &occ->children);
 272           return;
 273         }
 274
 275       else if (dom != idom)
 276         {
 277           gcc_assert (!dom->aux);
 278
 279           /* There is a dominator between IDOM and BB, add it and make
 280              two children out of NEW_OCC and OCC.  First, remove OCC from
 281              its list.  */
 282           *p_occ = occ->next;
 283           new_occ->next = occ;
 284           occ->next = NULL;
 285
 286           /* None of the previous blocks has DOM as a dominator: if we tail
 287              recursed, we would reexamine them uselessly. Just switch BB with
 288              DOM, and go on looking for blocks dominated by DOM.  */
 289           new_occ = occ_new (dom, new_occ);
 290         }
 291
 292       else
 293         {
 294           /* Nothing special, go on with the next element.  */
 295           p_occ = &occ->next;
 296         }
 297     }
 298
 299   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 300   new_occ->next = *p_head;
 301   *p_head = new_occ;
 302 }
 303
 304 /* Register that we found a division in BB.  */
 305
 306 static inline void
 307 register_division_in (basic_block bb)
 308 {
 309   struct occurrence *occ;
 310
 311   occ = (struct occurrence *) bb->aux;
 312   if (!occ)
 313     {
 314       occ = occ_new (bb, NULL);
 315       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 316     }
 317
 318   occ->bb_has_division = true;
 319   occ->num_divisions++;
 320 }
 321
 322
 323 /* Compute the number of divisions that postdominate each block in OCC and
 324    its children.  */
 325
 326 static void
 327 compute_merit (struct occurrence *occ)
 328 {
 329   struct occurrence *occ_child;
 330   basic_block dom = occ->bb;
 331
 332   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 333     {
 334       basic_block bb;
 335       if (occ_child->children)
 336         compute_merit (occ_child);
 337
 338       if (flag_exceptions)
 339         bb = single_noncomplex_succ (dom);
 340       else
 341         bb = dom;
 342
 343       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 344         occ->num_divisions += occ_child->num_divisions;
 345     }
 346 }
 347
 348
 349 /* Return whether USE_STMT is a floating-point division by DEF.  */
 350 static inline bool
 351 is_division_by (gimple use_stmt, tree def)
 352 {
 353   return is_gimple_assign (use_stmt)
 354          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 355          && gimple_assign_rhs2 (use_stmt) == def
 356          /* Do not recognize x / x as valid division, as we are getting
 357             confused later by replacing all immediate uses x in such
 358             a stmt.  */
 359          && gimple_assign_rhs1 (use_stmt) != def;
 360 }
 361
 362 /* Walk the subset of the dominator tree rooted at OCC, setting the
 363    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 364    the given basic block.  The field may be left NULL, of course,
 365    if it is not possible or profitable to do the optimization.
 366
 367    DEF_BSI is an iterator pointing at the statement defining DEF.
 368    If RECIP_DEF is set, a dominator already has a computation that can
 369    be used.  */
 370
 371 static void
 372 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 373                     tree def, tree recip_def, int threshold)
 374 {
 375   tree type;
 376   gassign *new_stmt;
 377   gimple_stmt_iterator gsi;
 378   struct occurrence *occ_child;
 379
 380   if (!recip_def
 381       && (occ->bb_has_division || !flag_trapping_math)
 382       && occ->num_divisions >= threshold)
 383     {
 384       /* Make a variable with the replacement and substitute it.  */
 385       type = TREE_TYPE (def);
 386       recip_def = create_tmp_reg (type, "reciptmp");
 387       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 388                                       build_one_cst (type), def);
 389
 390       if (occ->bb_has_division)
 391         {
 392           /* Case 1: insert before an existing division.  */
 393           gsi = gsi_after_labels (occ->bb);
 394           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 395             gsi_next (&gsi);
 396
 397           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 398         }
 399       else if (def_gsi && occ->bb == def_gsi->bb)
 400         {
 401           /* Case 2: insert right after the definition.  Note that this will
 402              never happen if the definition statement can throw, because in
 403              that case the sole successor of the statement's basic block will
 404              dominate all the uses as well.  */
 405           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 406         }
 407       else
 408         {
 409           /* Case 3: insert in a basic block not containing defs/uses.  */
 410           gsi = gsi_after_labels (occ->bb);
 411           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 412         }
 413
 414       reciprocal_stats.rdivs_inserted++;
 415
 416       occ->recip_def_stmt = new_stmt;
 417     }
 418
 419   occ->recip_def = recip_def;
 420   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 421     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 422 }
 423
 424
 425 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 426    possible.  */
 427
 428 static inline void
 429 replace_reciprocal (use_operand_p use_p)
 430 {
 431   gimple use_stmt = USE_STMT (use_p);
 432   basic_block bb = gimple_bb (use_stmt);
 433   struct occurrence *occ = (struct occurrence *) bb->aux;
 434
 435   if (optimize_bb_for_speed_p (bb)
 436       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 437     {
 438       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 439       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 440       SET_USE (use_p, occ->recip_def);
 441       fold_stmt_inplace (&gsi);
 442       update_stmt (use_stmt);
 443     }
 444 }
 445
 446
 447 /* Free OCC and return one more "struct occurrence" to be freed.  */
 448
 449 static struct occurrence *
 450 free_bb (struct occurrence *occ)
 451 {
 452   struct occurrence *child, *next;
 453
 454   /* First get the two pointers hanging off OCC.  */
 455   next = occ->next;
 456   child = occ->children;
 457   occ->bb->aux = NULL;
 458   pool_free (occ_pool, occ);
 459
 460   /* Now ensure that we don't recurse unless it is necessary.  */
 461   if (!child)
 462     return next;
 463   else
 464     {
 465       while (next)
 466         next = free_bb (next);
 467
 468       return child;
 469     }
 470 }
 471
 472
 473 /* Look for floating-point divisions among DEF's uses, and try to
 474    replace them by multiplications with the reciprocal.  Add
 475    as many statements computing the reciprocal as needed.
 476
 477    DEF must be a GIMPLE register of a floating-point type.  */
 478
 479 static void
 480 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 481 {
 482   use_operand_p use_p;
 483   imm_use_iterator use_iter;
 484   struct occurrence *occ;
 485   int count = 0, threshold;
 486
 487   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 488
 489   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 490     {
 491       gimple use_stmt = USE_STMT (use_p);
 492       if (is_division_by (use_stmt, def))
 493         {
 494           register_division_in (gimple_bb (use_stmt));
 495           count++;
 496         }
 497     }
 498
 499   /* Do the expensive part only if we can hope to optimize something.  */
 500   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 501   if (count >= threshold)
 502     {
 503       gimple use_stmt;
 504       for (occ = occ_head; occ; occ = occ->next)
 505         {
 506           compute_merit (occ);
 507           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 508         }
 509
 510       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 511         {
 512           if (is_division_by (use_stmt, def))
 513             {
 514               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 515                 replace_reciprocal (use_p);
 516             }
 517         }
 518     }
 519
 520   for (occ = occ_head; occ; )
 521     occ = free_bb (occ);
 522
 523   occ_head = NULL;
 524 }
 525
 526 /* Go through all the floating-point SSA_NAMEs, and call
 527    execute_cse_reciprocals_1 on each of them.  */
 528 namespace {
 529
 530 const pass_data pass_data_cse_reciprocals =
 531 {
 532   GIMPLE_PASS, /* type */
 533   "recip", /* name */
 534   OPTGROUP_NONE, /* optinfo_flags */
 535   TV_NONE, /* tv_id */
 536   PROP_ssa, /* properties_required */
 537   0, /* properties_provided */
 538   0, /* properties_destroyed */
 539   0, /* todo_flags_start */
 540   TODO_update_ssa, /* todo_flags_finish */
 541 };
 542
 543 class pass_cse_reciprocals : public gimple_opt_pass
 544 {
 545 public:
 546   pass_cse_reciprocals (gcc::context *ctxt)
 547     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 548   {}
 549
 550   /* opt_pass methods: */
 551   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 552   virtual unsigned int execute (function *);
 553
 554 }; // class pass_cse_reciprocals
 555
 556 unsigned int
 557 pass_cse_reciprocals::execute (function *fun)
 558 {
 559   basic_block bb;
 560   tree arg;
 561
 562   occ_pool = create_alloc_pool ("dominators for recip",
 563                                 sizeof (struct occurrence),
 564                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 565
 566   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 567   calculate_dominance_info (CDI_DOMINATORS);
 568   calculate_dominance_info (CDI_POST_DOMINATORS);
 569
 570 #ifdef ENABLE_CHECKING
 571   FOR_EACH_BB_FN (bb, fun)
 572     gcc_assert (!bb->aux);
 573 #endif
 574
 575   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 576     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 577         && is_gimple_reg (arg))
 578       {
 579         tree name = ssa_default_def (fun, arg);
 580         if (name)
 581           execute_cse_reciprocals_1 (NULL, name);
 582       }
 583
 584   FOR_EACH_BB_FN (bb, fun)
 585     {
 586       tree def;
 587
 588       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 589            gsi_next (&gsi))
 590         {
 591           gphi *phi = gsi.phi ();
 592           def = PHI_RESULT (phi);
 593           if (! virtual_operand_p (def)
 594               && FLOAT_TYPE_P (TREE_TYPE (def)))
 595             execute_cse_reciprocals_1 (NULL, def);
 596         }
 597
 598       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 599            gsi_next (&gsi))
 600         {
 601           gimple stmt = gsi_stmt (gsi);
 602
 603           if (gimple_has_lhs (stmt)
 604               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 605               && FLOAT_TYPE_P (TREE_TYPE (def))
 606               && TREE_CODE (def) == SSA_NAME)
 607             execute_cse_reciprocals_1 (&gsi, def);
 608         }
 609
 610       if (optimize_bb_for_size_p (bb))
 611         continue;
 612
 613       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 614       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 615            gsi_next (&gsi))
 616         {
 617           gimple stmt = gsi_stmt (gsi);
 618           tree fndecl;
 619
 620           if (is_gimple_assign (stmt)
 621               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 622             {
 623               tree arg1 = gimple_assign_rhs2 (stmt);
 624               gimple stmt1;
 625
 626               if (TREE_CODE (arg1) != SSA_NAME)
 627                 continue;
 628
 629               stmt1 = SSA_NAME_DEF_STMT (arg1);
 630
 631               if (is_gimple_call (stmt1)
 632                   && gimple_call_lhs (stmt1)
 633                   && (fndecl = gimple_call_fndecl (stmt1))
 634                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 635                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 636                 {
 637                   enum built_in_function code;
 638                   bool md_code, fail;
 639                   imm_use_iterator ui;
 640                   use_operand_p use_p;
 641
 642                   code = DECL_FUNCTION_CODE (fndecl);
 643                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 644
 645                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 646                   if (!fndecl)
 647                     continue;
 648
 649                   /* Check that all uses of the SSA name are divisions,
 650                      otherwise replacing the defining statement will do
 651                      the wrong thing.  */
 652                   fail = false;
 653                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 654                     {
 655                       gimple stmt2 = USE_STMT (use_p);
 656                       if (is_gimple_debug (stmt2))
 657                         continue;
 658                       if (!is_gimple_assign (stmt2)
 659                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 660                           || gimple_assign_rhs1 (stmt2) == arg1
 661                           || gimple_assign_rhs2 (stmt2) != arg1)
 662                         {
 663                           fail = true;
 664                           break;
 665                         }
 666                     }
 667                   if (fail)
 668                     continue;
 669
 670                   gimple_replace_ssa_lhs (stmt1, arg1);
 671                   gimple_call_set_fndecl (stmt1, fndecl);
 672                   update_stmt (stmt1);
 673                   reciprocal_stats.rfuncs_inserted++;
 674
 675                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 676                     {
 677                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 678                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 679                       fold_stmt_inplace (&gsi);
 680                       update_stmt (stmt);
 681                     }
 682                 }
 683             }
 684         }
 685     }
 686
 687   statistics_counter_event (fun, "reciprocal divs inserted",
 688                             reciprocal_stats.rdivs_inserted);
 689   statistics_counter_event (fun, "reciprocal functions inserted",
 690                             reciprocal_stats.rfuncs_inserted);
 691
 692   free_dominance_info (CDI_DOMINATORS);
 693   free_dominance_info (CDI_POST_DOMINATORS);
 694   free_alloc_pool (occ_pool);
 695   return 0;
 696 }
 697
 698 } // anon namespace
 699
 700 gimple_opt_pass *
 701 make_pass_cse_reciprocals (gcc::context *ctxt)
 702 {
 703   return new pass_cse_reciprocals (ctxt);
 704 }
 705
 706 /* Records an occurrence at statement USE_STMT in the vector of trees
 707    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 708    is not yet initialized.  Returns true if the occurrence was pushed on
 709    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 710    statements in the vector.  */
 711
 712 static bool
 713 maybe_record_sincos (vec<gimple> *stmts,
 714                      basic_block *top_bb, gimple use_stmt)
 715 {
 716   basic_block use_bb = gimple_bb (use_stmt);
 717   if (*top_bb
 718       && (*top_bb == use_bb
 719           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 720     stmts->safe_push (use_stmt);
 721   else if (!*top_bb
 722            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 723     {
 724       stmts->safe_push (use_stmt);
 725       *top_bb = use_bb;
 726     }
 727   else
 728     return false;
 729
 730   return true;
 731 }
 732
 733 /* Look for sin, cos and cexpi calls with the same argument NAME and
 734    create a single call to cexpi CSEing the result in this case.
 735    We first walk over all immediate uses of the argument collecting
 736    statements that we can CSE in a vector and in a second pass replace
 737    the statement rhs with a REALPART or IMAGPART expression on the
 738    result of the cexpi call we insert before the use statement that
 739    dominates all other candidates.  */
 740
 741 static bool
 742 execute_cse_sincos_1 (tree name)
 743 {
 744   gimple_stmt_iterator gsi;
 745   imm_use_iterator use_iter;
 746   tree fndecl, res, type;
 747   gimple def_stmt, use_stmt, stmt;
 748   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 749   auto_vec<gimple> stmts;
 750   basic_block top_bb = NULL;
 751   int i;
 752   bool cfg_changed = false;
 753
 754   type = TREE_TYPE (name);
 755   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 756     {
 757       if (gimple_code (use_stmt) != GIMPLE_CALL
 758           || !gimple_call_lhs (use_stmt)
 759           || !(fndecl = gimple_call_fndecl (use_stmt))
 760           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 761         continue;
 762
 763       switch (DECL_FUNCTION_CODE (fndecl))
 764         {
 765         CASE_FLT_FN (BUILT_IN_COS):
 766           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 767           break;
 768
 769         CASE_FLT_FN (BUILT_IN_SIN):
 770           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 771           break;
 772
 773         CASE_FLT_FN (BUILT_IN_CEXPI):
 774           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 775           break;
 776
 777         default:;
 778         }
 779     }
 780
 781   if (seen_cos + seen_sin + seen_cexpi <= 1)
 782     return false;
 783
 784   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 785      the name def statement.  */
 786   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 787   if (!fndecl)
 788     return false;
 789   stmt = gimple_build_call (fndecl, 1, name);
 790   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 791   gimple_call_set_lhs (stmt, res);
 792
 793   def_stmt = SSA_NAME_DEF_STMT (name);
 794   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 795       && gimple_code (def_stmt) != GIMPLE_PHI
 796       && gimple_bb (def_stmt) == top_bb)
 797     {
 798       gsi = gsi_for_stmt (def_stmt);
 799       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 800     }
 801   else
 802     {
 803       gsi = gsi_after_labels (top_bb);
 804       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 805     }
 806   sincos_stats.inserted++;
 807
 808   /* And adjust the recorded old call sites.  */
 809   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 810     {
 811       tree rhs = NULL;
 812       fndecl = gimple_call_fndecl (use_stmt);
 813
 814       switch (DECL_FUNCTION_CODE (fndecl))
 815         {
 816         CASE_FLT_FN (BUILT_IN_COS):
 817           rhs = fold_build1 (REALPART_EXPR, type, res);
 818           break;
 819
 820         CASE_FLT_FN (BUILT_IN_SIN):
 821           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 822           break;
 823
 824         CASE_FLT_FN (BUILT_IN_CEXPI):
 825           rhs = res;
 826           break;
 827
 828         default:;
 829           gcc_unreachable ();
 830         }
 831
 832         /* Replace call with a copy.  */
 833         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 834
 835         gsi = gsi_for_stmt (use_stmt);
 836         gsi_replace (&gsi, stmt, true);
 837         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 838           cfg_changed = true;
 839     }
 840
 841   return cfg_changed;
 842 }
 843
 844 /* To evaluate powi(x,n), the floating point value x raised to the
 845    constant integer exponent n, we use a hybrid algorithm that
 846    combines the "window method" with look-up tables.  For an
 847    introduction to exponentiation algorithms and "addition chains",
 848    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 849    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 850    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 851    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 852
 853 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 854    multiplications to inline before calling the system library's pow
 855    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 856    so this default never requires calling pow, powf or powl.  */
 857
 858 #ifndef POWI_MAX_MULTS
 859 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 860 #endif
 861
 862 /* The size of the "optimal power tree" lookup table.  All
 863    exponents less than this value are simply looked up in the
 864    powi_table below.  This threshold is also used to size the
 865    cache of pseudo registers that hold intermediate results.  */
 866 #define POWI_TABLE_SIZE 256
 867
 868 /* The size, in bits of the window, used in the "window method"
 869    exponentiation algorithm.  This is equivalent to a radix of
 870    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 871 #define POWI_WINDOW_SIZE 3
 872
 873 /* The following table is an efficient representation of an
 874    "optimal power tree".  For each value, i, the corresponding
 875    value, j, in the table states that an optimal evaluation
 876    sequence for calculating pow(x,i) can be found by evaluating
 877    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 878    100 integers is given in Knuth's "Seminumerical algorithms".  */
 879
 880 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 881   {
 882       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 883       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 884       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 885      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 886      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 887      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 888      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 889      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 890      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 891      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 892      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 893      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 894      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 895      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 896      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 897      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 898      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 899      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 900      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 901      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 902      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 903      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 904      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 905      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 906      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 907     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 908     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 909     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 910     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 911     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 912     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 913     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 914   };
 915
 916
 917 /* Return the number of multiplications required to calculate
 918    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 919    subroutine of powi_cost.  CACHE is an array indicating
 920    which exponents have already been calculated.  */
 921
 922 static int
 923 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 924 {
 925   /* If we've already calculated this exponent, then this evaluation
 926      doesn't require any additional multiplications.  */
 927   if (cache[n])
 928     return 0;
 929
 930   cache[n] = true;
 931   return powi_lookup_cost (n - powi_table[n], cache)
 932          + powi_lookup_cost (powi_table[n], cache) + 1;
 933 }
 934
 935 /* Return the number of multiplications required to calculate
 936    powi(x,n) for an arbitrary x, given the exponent N.  This
 937    function needs to be kept in sync with powi_as_mults below.  */
 938
 939 static int
 940 powi_cost (HOST_WIDE_INT n)
 941 {
 942   bool cache[POWI_TABLE_SIZE];
 943   unsigned HOST_WIDE_INT digit;
 944   unsigned HOST_WIDE_INT val;
 945   int result;
 946
 947   if (n == 0)
 948     return 0;
 949
 950   /* Ignore the reciprocal when calculating the cost.  */
 951   val = (n < 0) ? -n : n;
 952
 953   /* Initialize the exponent cache.  */
 954   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 955   cache[1] = true;
 956
 957   result = 0;
 958
 959   while (val >= POWI_TABLE_SIZE)
 960     {
 961       if (val & 1)
 962         {
 963           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 964           result += powi_lookup_cost (digit, cache)
 965                     + POWI_WINDOW_SIZE + 1;
 966           val >>= POWI_WINDOW_SIZE;
 967         }
 968       else
 969         {
 970           val >>= 1;
 971           result++;
 972         }
 973     }
 974
 975   return result + powi_lookup_cost (val, cache);
 976 }
 977
 978 /* Recursive subroutine of powi_as_mults.  This function takes the
 979    array, CACHE, of already calculated exponents and an exponent N and
 980    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 981
 982 static tree
 983 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 984                  HOST_WIDE_INT n, tree *cache)
 985 {
 986   tree op0, op1, ssa_target;
 987   unsigned HOST_WIDE_INT digit;
 988   gassign *mult_stmt;
 989
 990   if (n < POWI_TABLE_SIZE && cache[n])
 991     return cache[n];
 992
 993   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 994
 995   if (n < POWI_TABLE_SIZE)
 996     {
 997       cache[n] = ssa_target;
 998       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 999       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1000     }
1001   else if (n & 1)
1002     {
1003       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1004       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1005       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1006     }
1007   else
1008     {
1009       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1010       op1 = op0;
1011     }
1012
1013   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1014   gimple_set_location (mult_stmt, loc);
1015   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1016
1017   return ssa_target;
1018 }
1019
1020 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1021    This function needs to be kept in sync with powi_cost above.  */
1022
1023 static tree
1024 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1025                tree arg0, HOST_WIDE_INT n)
1026 {
1027   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1028   gassign *div_stmt;
1029   tree target;
1030
1031   if (n == 0)
1032     return build_real (type, dconst1);
1033
1034   memset (cache, 0,  sizeof (cache));
1035   cache[1] = arg0;
1036
1037   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1038   if (n >= 0)
1039     return result;
1040
1041   /* If the original exponent was negative, reciprocate the result.  */
1042   target = make_temp_ssa_name (type, NULL, "powmult");
1043   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1044                                   build_real (type, dconst1), result);
1045   gimple_set_location (div_stmt, loc);
1046   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1047
1048   return target;
1049 }
1050
1051 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1052    location info LOC.  If the arguments are appropriate, create an
1053    equivalent sequence of statements prior to GSI using an optimal
1054    number of multiplications, and return an expession holding the
1055    result.  */
1056
1057 static tree
1058 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1059                             tree arg0, HOST_WIDE_INT n)
1060 {
1061   /* Avoid largest negative number.  */
1062   if (n != -n
1063       && ((n >= -1 && n <= 2)
1064           || (optimize_function_for_speed_p (cfun)
1065               && powi_cost (n) <= POWI_MAX_MULTS)))
1066     return powi_as_mults (gsi, loc, arg0, n);
1067
1068   return NULL_TREE;
1069 }
1070
1071 /* Build a gimple call statement that calls FN with argument ARG.
1072    Set the lhs of the call statement to a fresh SSA name.  Insert the
1073    statement prior to GSI's current position, and return the fresh
1074    SSA name.  */
1075
1076 static tree
1077 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1078                        tree fn, tree arg)
1079 {
1080   gcall *call_stmt;
1081   tree ssa_target;
1082
1083   call_stmt = gimple_build_call (fn, 1, arg);
1084   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1085   gimple_set_lhs (call_stmt, ssa_target);
1086   gimple_set_location (call_stmt, loc);
1087   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1088
1089   return ssa_target;
1090 }
1091
1092 /* Build a gimple binary operation with the given CODE and arguments
1093    ARG0, ARG1, assigning the result to a new SSA name for variable
1094    TARGET.  Insert the statement prior to GSI's current position, and
1095    return the fresh SSA name.*/
1096
1097 static tree
1098 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1099                         const char *name, enum tree_code code,
1100                         tree arg0, tree arg1)
1101 {
1102   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1103   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1104   gimple_set_location (stmt, loc);
1105   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1106   return result;
1107 }
1108
1109 /* Build a gimple reference operation with the given CODE and argument
1110    ARG, assigning the result to a new SSA name of TYPE with NAME.
1111    Insert the statement prior to GSI's current position, and return
1112    the fresh SSA name.  */
1113
1114 static inline tree
1115 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1116                       const char *name, enum tree_code code, tree arg0)
1117 {
1118   tree result = make_temp_ssa_name (type, NULL, name);
1119   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1120   gimple_set_location (stmt, loc);
1121   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1122   return result;
1123 }
1124
1125 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1126    prior to GSI's current position, and return the fresh SSA name.  */
1127
1128 static tree
1129 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1130                        tree type, tree val)
1131 {
1132   tree result = make_ssa_name (type);
1133   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1134   gimple_set_location (stmt, loc);
1135   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1136   return result;
1137 }
1138
1139 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1140    with location info LOC.  If possible, create an equivalent and
1141    less expensive sequence of statements prior to GSI, and return an
1142    expession holding the result.  */
1143
1144 static tree
1145 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1146                            tree arg0, tree arg1)
1147 {
1148   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1149   REAL_VALUE_TYPE c2, dconst3;
1150   HOST_WIDE_INT n;
1151   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1152   machine_mode mode;
1153   bool hw_sqrt_exists, c_is_int, c2_is_int;
1154
1155   /* If the exponent isn't a constant, there's nothing of interest
1156      to be done.  */
1157   if (TREE_CODE (arg1) != REAL_CST)
1158     return NULL_TREE;
1159
1160   /* If the exponent is equivalent to an integer, expand to an optimal
1161      multiplication sequence when profitable.  */
1162   c = TREE_REAL_CST (arg1);
1163   n = real_to_integer (&c);
1164   real_from_integer (&cint, VOIDmode, n, SIGNED);
1165   c_is_int = real_identical (&c, &cint);
1166
1167   if (c_is_int
1168       && ((n >= -1 && n <= 2)
1169           || (flag_unsafe_math_optimizations
1170               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1171               && powi_cost (n) <= POWI_MAX_MULTS)))
1172     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1173
1174   /* Attempt various optimizations using sqrt and cbrt.  */
1175   type = TREE_TYPE (arg0);
1176   mode = TYPE_MODE (type);
1177   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1178
1179   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1180      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1181      sqrt(-0) = -0.  */
1182   if (sqrtfn
1183       && REAL_VALUES_EQUAL (c, dconsthalf)
1184       && !HONOR_SIGNED_ZEROS (mode))
1185     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1186
1187   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1188      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1189      so do this optimization even if -Os.  Don't do this optimization
1190      if we don't have a hardware sqrt insn.  */
1191   dconst1_4 = dconst1;
1192   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1193   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1194
1195   if (flag_unsafe_math_optimizations
1196       && sqrtfn
1197       && REAL_VALUES_EQUAL (c, dconst1_4)
1198       && hw_sqrt_exists)
1199     {
1200       /* sqrt(x)  */
1201       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1202
1203       /* sqrt(sqrt(x))  */
1204       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1205     }
1206
1207   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1208      optimizing for space.  Don't do this optimization if we don't have
1209      a hardware sqrt insn.  */
1210   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1211   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1212
1213   if (flag_unsafe_math_optimizations
1214       && sqrtfn
1215       && optimize_function_for_speed_p (cfun)
1216       && REAL_VALUES_EQUAL (c, dconst3_4)
1217       && hw_sqrt_exists)
1218     {
1219       /* sqrt(x)  */
1220       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1221
1222       /* sqrt(sqrt(x))  */
1223       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1224
1225       /* sqrt(x) * sqrt(sqrt(x))  */
1226       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1227                                      sqrt_arg0, sqrt_sqrt);
1228     }
1229
1230   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1231      optimizations since 1./3. is not exactly representable.  If x
1232      is negative and finite, the correct value of pow(x,1./3.) is
1233      a NaN with the "invalid" exception raised, because the value
1234      of 1./3. actually has an even denominator.  The correct value
1235      of cbrt(x) is a negative real value.  */
1236   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1237   dconst1_3 = real_value_truncate (mode, dconst_third ());
1238
1239   if (flag_unsafe_math_optimizations
1240       && cbrtfn
1241       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1242       && REAL_VALUES_EQUAL (c, dconst1_3))
1243     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1244
1245   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1246      if we don't have a hardware sqrt insn.  */
1247   dconst1_6 = dconst1_3;
1248   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1249
1250   if (flag_unsafe_math_optimizations
1251       && sqrtfn
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && optimize_function_for_speed_p (cfun)
1255       && hw_sqrt_exists
1256       && REAL_VALUES_EQUAL (c, dconst1_6))
1257     {
1258       /* sqrt(x)  */
1259       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1260
1261       /* cbrt(sqrt(x))  */
1262       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1263     }
1264
1265   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1266      and c not an integer, into
1267
1268        sqrt(x) * powi(x, n/2),                n > 0;
1269        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1270
1271      Do not calculate the powi factor when n/2 = 0.  */
1272   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1273   n = real_to_integer (&c2);
1274   real_from_integer (&cint, VOIDmode, n, SIGNED);
1275   c2_is_int = real_identical (&c2, &cint);
1276
1277   if (flag_unsafe_math_optimizations
1278       && sqrtfn
1279       && c2_is_int
1280       && !c_is_int
1281       && optimize_function_for_speed_p (cfun))
1282     {
1283       tree powi_x_ndiv2 = NULL_TREE;
1284
1285       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1286          possible or profitable, give up.  Skip the degenerate case when
1287          n is 1 or -1, where the result is always 1.  */
1288       if (absu_hwi (n) != 1)
1289         {
1290           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1291                                                      abs_hwi (n / 2));
1292           if (!powi_x_ndiv2)
1293             return NULL_TREE;
1294         }
1295
1296       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1297          result of the optimal multiply sequence just calculated.  */
1298       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1299
1300       if (absu_hwi (n) == 1)
1301         result = sqrt_arg0;
1302       else
1303         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1304                                          sqrt_arg0, powi_x_ndiv2);
1305
1306       /* If n is negative, reciprocate the result.  */
1307       if (n < 0)
1308         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1309                                          build_real (type, dconst1), result);
1310       return result;
1311     }
1312
1313   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1314
1315      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1316      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1317
1318      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1319      different from pow(x, 1./3.) due to rounding and behavior with
1320      negative x, we need to constrain this transformation to unsafe
1321      math and positive x or finite math.  */
1322   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1323   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1324   real_round (&c2, mode, &c2);
1325   n = real_to_integer (&c2);
1326   real_from_integer (&cint, VOIDmode, n, SIGNED);
1327   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1328   real_convert (&c2, mode, &c2);
1329
1330   if (flag_unsafe_math_optimizations
1331       && cbrtfn
1332       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1333       && real_identical (&c2, &c)
1334       && !c2_is_int
1335       && optimize_function_for_speed_p (cfun)
1336       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1337     {
1338       tree powi_x_ndiv3 = NULL_TREE;
1339
1340       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1341          possible or profitable, give up.  Skip the degenerate case when
1342          abs(n) < 3, where the result is always 1.  */
1343       if (absu_hwi (n) >= 3)
1344         {
1345           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1346                                                      abs_hwi (n / 3));
1347           if (!powi_x_ndiv3)
1348             return NULL_TREE;
1349         }
1350
1351       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1352          as that creates an unnecessary variable.  Instead, just produce
1353          either cbrt(x) or cbrt(x) * cbrt(x).  */
1354       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1355
1356       if (absu_hwi (n) % 3 == 1)
1357         powi_cbrt_x = cbrt_x;
1358       else
1359         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1360                                               cbrt_x, cbrt_x);
1361
1362       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1363       if (absu_hwi (n) < 3)
1364         result = powi_cbrt_x;
1365       else
1366         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1367                                          powi_x_ndiv3, powi_cbrt_x);
1368
1369       /* If n is negative, reciprocate the result.  */
1370       if (n < 0)
1371         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1372                                          build_real (type, dconst1), result);
1373
1374       return result;
1375     }
1376
1377   /* No optimizations succeeded.  */
1378   return NULL_TREE;
1379 }
1380
1381 /* ARG is the argument to a cabs builtin call in GSI with location info
1382    LOC.  Create a sequence of statements prior to GSI that calculates
1383    sqrt(R*R + I*I), where R and I are the real and imaginary components
1384    of ARG, respectively.  Return an expression holding the result.  */
1385
1386 static tree
1387 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1388 {
1389   tree real_part, imag_part, addend1, addend2, sum, result;
1390   tree type = TREE_TYPE (TREE_TYPE (arg));
1391   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1392   machine_mode mode = TYPE_MODE (type);
1393
1394   if (!flag_unsafe_math_optimizations
1395       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1396       || !sqrtfn
1397       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1398     return NULL_TREE;
1399
1400   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1401                                     REALPART_EXPR, arg);
1402   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1403                                     real_part, real_part);
1404   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1405                                     IMAGPART_EXPR, arg);
1406   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1407                                     imag_part, imag_part);
1408   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1409   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1410
1411   return result;
1412 }
1413
1414 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1415    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1416    an optimal number of multiplies, when n is a constant.  */
1417
1418 namespace {
1419
1420 const pass_data pass_data_cse_sincos =
1421 {
1422   GIMPLE_PASS, /* type */
1423   "sincos", /* name */
1424   OPTGROUP_NONE, /* optinfo_flags */
1425   TV_NONE, /* tv_id */
1426   PROP_ssa, /* properties_required */
1427   0, /* properties_provided */
1428   0, /* properties_destroyed */
1429   0, /* todo_flags_start */
1430   TODO_update_ssa, /* todo_flags_finish */
1431 };
1432
1433 class pass_cse_sincos : public gimple_opt_pass
1434 {
1435 public:
1436   pass_cse_sincos (gcc::context *ctxt)
1437     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1438   {}
1439
1440   /* opt_pass methods: */
1441   virtual bool gate (function *)
1442     {
1443       /* We no longer require either sincos or cexp, since powi expansion
1444          piggybacks on this pass.  */
1445       return optimize;
1446     }
1447
1448   virtual unsigned int execute (function *);
1449
1450 }; // class pass_cse_sincos
1451
1452 unsigned int
1453 pass_cse_sincos::execute (function *fun)
1454 {
1455   basic_block bb;
1456   bool cfg_changed = false;
1457
1458   calculate_dominance_info (CDI_DOMINATORS);
1459   memset (&sincos_stats, 0, sizeof (sincos_stats));
1460
1461   FOR_EACH_BB_FN (bb, fun)
1462     {
1463       gimple_stmt_iterator gsi;
1464       bool cleanup_eh = false;
1465
1466       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1467         {
1468           gimple stmt = gsi_stmt (gsi);
1469           tree fndecl;
1470
1471           /* Only the last stmt in a bb could throw, no need to call
1472              gimple_purge_dead_eh_edges if we change something in the middle
1473              of a basic block.  */
1474           cleanup_eh = false;
1475
1476           if (is_gimple_call (stmt)
1477               && gimple_call_lhs (stmt)
1478               && (fndecl = gimple_call_fndecl (stmt))
1479               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1480             {
1481               tree arg, arg0, arg1, result;
1482               HOST_WIDE_INT n;
1483               location_t loc;
1484
1485               switch (DECL_FUNCTION_CODE (fndecl))
1486                 {
1487                 CASE_FLT_FN (BUILT_IN_COS):
1488                 CASE_FLT_FN (BUILT_IN_SIN):
1489                 CASE_FLT_FN (BUILT_IN_CEXPI):
1490                   /* Make sure we have either sincos or cexp.  */
1491                   if (!targetm.libc_has_function (function_c99_math_complex)
1492                       && !targetm.libc_has_function (function_sincos))
1493                     break;
1494
1495                   arg = gimple_call_arg (stmt, 0);
1496                   if (TREE_CODE (arg) == SSA_NAME)
1497                     cfg_changed |= execute_cse_sincos_1 (arg);
1498                   break;
1499
1500                 CASE_FLT_FN (BUILT_IN_POW):
1501                   arg0 = gimple_call_arg (stmt, 0);
1502                   arg1 = gimple_call_arg (stmt, 1);
1503
1504                   loc = gimple_location (stmt);
1505                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1506
1507                   if (result)
1508                     {
1509                       tree lhs = gimple_get_lhs (stmt);
1510                       gassign *new_stmt = gimple_build_assign (lhs, result);
1511                       gimple_set_location (new_stmt, loc);
1512                       unlink_stmt_vdef (stmt);
1513                       gsi_replace (&gsi, new_stmt, true);
1514                       cleanup_eh = true;
1515                       if (gimple_vdef (stmt))
1516                         release_ssa_name (gimple_vdef (stmt));
1517                     }
1518                   break;
1519
1520                 CASE_FLT_FN (BUILT_IN_POWI):
1521                   arg0 = gimple_call_arg (stmt, 0);
1522                   arg1 = gimple_call_arg (stmt, 1);
1523                   loc = gimple_location (stmt);
1524
1525                   if (real_minus_onep (arg0))
1526                     {
1527                       tree t0, t1, cond, one, minus_one;
1528                       gassign *stmt;
1529
1530                       t0 = TREE_TYPE (arg0);
1531                       t1 = TREE_TYPE (arg1);
1532                       one = build_real (t0, dconst1);
1533                       minus_one = build_real (t0, dconstm1);
1534
1535                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1536                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1537                                                   arg1, build_int_cst (t1, 1));
1538                       gimple_set_location (stmt, loc);
1539                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1540
1541                       result = make_temp_ssa_name (t0, NULL, "powi");
1542                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1543                                                   minus_one, one);
1544                       gimple_set_location (stmt, loc);
1545                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1546                     }
1547                   else
1548                     {
1549                       if (!tree_fits_shwi_p (arg1))
1550                         break;
1551
1552                       n = tree_to_shwi (arg1);
1553                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1554                     }
1555
1556                   if (result)
1557                     {
1558                       tree lhs = gimple_get_lhs (stmt);
1559                       gassign *new_stmt = gimple_build_assign (lhs, result);
1560                       gimple_set_location (new_stmt, loc);
1561                       unlink_stmt_vdef (stmt);
1562                       gsi_replace (&gsi, new_stmt, true);
1563                       cleanup_eh = true;
1564                       if (gimple_vdef (stmt))
1565                         release_ssa_name (gimple_vdef (stmt));
1566                     }
1567                   break;
1568
1569                 CASE_FLT_FN (BUILT_IN_CABS):
1570                   arg0 = gimple_call_arg (stmt, 0);
1571                   loc = gimple_location (stmt);
1572                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1573
1574                   if (result)
1575                     {
1576                       tree lhs = gimple_get_lhs (stmt);
1577                       gassign *new_stmt = gimple_build_assign (lhs, result);
1578                       gimple_set_location (new_stmt, loc);
1579                       unlink_stmt_vdef (stmt);
1580                       gsi_replace (&gsi, new_stmt, true);
1581                       cleanup_eh = true;
1582                       if (gimple_vdef (stmt))
1583                         release_ssa_name (gimple_vdef (stmt));
1584                     }
1585                   break;
1586
1587                 default:;
1588                 }
1589             }
1590         }
1591       if (cleanup_eh)
1592         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1593     }
1594
1595   statistics_counter_event (fun, "sincos statements inserted",
1596                             sincos_stats.inserted);
1597
1598   free_dominance_info (CDI_DOMINATORS);
1599   return cfg_changed ? TODO_cleanup_cfg : 0;
1600 }
1601
1602 } // anon namespace
1603
1604 gimple_opt_pass *
1605 make_pass_cse_sincos (gcc::context *ctxt)
1606 {
1607   return new pass_cse_sincos (ctxt);
1608 }
1609
1610 /* A symbolic number is used to detect byte permutation and selection
1611    patterns.  Therefore the field N contains an artificial number
1612    consisting of octet sized markers:
1613
1614    0    - target byte has the value 0
1615    FF   - target byte has an unknown value (eg. due to sign extension)
1616    1..size - marker value is the target byte index minus one.
1617
1618    To detect permutations on memory sources (arrays and structures), a symbolic
1619    number is also associated a base address (the array or structure the load is
1620    made from), an offset from the base address and a range which gives the
1621    difference between the highest and lowest accessed memory location to make
1622    such a symbolic number. The range is thus different from size which reflects
1623    the size of the type of current expression. Note that for non memory source,
1624    range holds the same value as size.
1625
1626    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1627    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1628    still have a size of 2 but this time a range of 1.  */
1629
1630 struct symbolic_number {
1631   uint64_t n;
1632   tree type;
1633   tree base_addr;
1634   tree offset;
1635   HOST_WIDE_INT bytepos;
1636   tree alias_set;
1637   tree vuse;
1638   unsigned HOST_WIDE_INT range;
1639 };
1640
/* Width in bits of one marker of a symbolic number, the mask that
   selects a single marker, and the marker value that stands for a
   byte of unknown content.  */
#define BITS_PER_MARKER 8
#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
#define MARKER_BYTE_UNKNOWN MARKER_MASK

/* Extract, still in place, the most significant marker of the
   symbolic number N when it is SIZE bytes wide.  */
#define HEAD_MARKER(n, size) \
  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a nop.  The number is masked according to the size of
   the symbolic number before using it.  */
#define CMPNOP \
  (sizeof (int64_t) < 8 \
   ? 0 \
   : (((uint64_t) 0x08070605 << 32) | 0x04030201))

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a byte swap.  The number is masked according to the
   size of the symbolic number before using it.  */
#define CMPXCHG \
  (sizeof (int64_t) < 8 \
   ? 0 \
   : (((uint64_t) 0x01020304 << 32) | 0x05060708))
1658
1659 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1660    number N.  Return false if the requested operation is not permitted
1661    on a symbolic number.  */
1662
1663 static inline bool
1664 do_shift_rotate (enum tree_code code,
1665                  struct symbolic_number *n,
1666                  int count)
1667 {
1668   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1669   unsigned head_marker;
1670
1671   if (count % BITS_PER_UNIT != 0)
1672     return false;
1673   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1674
1675   /* Zero out the extra bits of N in order to avoid them being shifted
1676      into the significant bits.  */
1677   if (size < 64 / BITS_PER_MARKER)
1678     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1679
1680   switch (code)
1681     {
1682     case LSHIFT_EXPR:
1683       n->n <<= count;
1684       break;
1685     case RSHIFT_EXPR:
1686       head_marker = HEAD_MARKER (n->n, size);
1687       n->n >>= count;
1688       /* Arithmetic shift of signed type: result is dependent on the value.  */
1689       if (!TYPE_UNSIGNED (n->type) && head_marker)
1690         for (i = 0; i < count / BITS_PER_MARKER; i++)
1691           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1692                   << ((size - 1 - i) * BITS_PER_MARKER);
1693       break;
1694     case LROTATE_EXPR:
1695       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1696       break;
1697     case RROTATE_EXPR:
1698       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1699       break;
1700     default:
1701       return false;
1702     }
1703   /* Zero unused bits for size.  */
1704   if (size < 64 / BITS_PER_MARKER)
1705     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1706   return true;
1707 }
1708
1709 /* Perform sanity checking for the symbolic number N and the gimple
1710    statement STMT.  */
1711
1712 static inline bool
1713 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1714 {
1715   tree lhs_type;
1716
1717   lhs_type = gimple_expr_type (stmt);
1718
1719   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1720     return false;
1721
1722   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1723     return false;
1724
1725   return true;
1726 }
1727
1728 /* Initialize the symbolic number N for the bswap pass from the base element
1729    SRC manipulated by the bitwise OR expression.  */
1730
1731 static bool
1732 init_symbolic_number (struct symbolic_number *n, tree src)
1733 {
1734   int size;
1735
1736   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1737
1738   /* Set up the symbolic number N by setting each byte to a value between 1 and
1739      the byte size of rhs1.  The highest order byte is set to n->size and the
1740      lowest order byte to 1.  */
1741   n->type = TREE_TYPE (src);
1742   size = TYPE_PRECISION (n->type);
1743   if (size % BITS_PER_UNIT != 0)
1744     return false;
1745   size /= BITS_PER_UNIT;
1746   if (size > 64 / BITS_PER_MARKER)
1747     return false;
1748   n->range = size;
1749   n->n = CMPNOP;
1750
1751   if (size < 64 / BITS_PER_MARKER)
1752     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1753
1754   return true;
1755 }
1756
1757 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1758    the answer. If so, REF is that memory source and the base of the memory area
1759    accessed and the offset of the access from that base are recorded in N.  */
1760
1761 bool
1762 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1763 {
1764   /* Leaf node is an array or component ref. Memorize its base and
1765      offset from base to compare to other such leaf node.  */
1766   HOST_WIDE_INT bitsize, bitpos;
1767   machine_mode mode;
1768   int unsignedp, volatilep;
1769   tree offset, base_addr;
1770
1771   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1772     return false;
1773
1774   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1775                                    &unsignedp, &volatilep, false);
1776
1777   if (TREE_CODE (base_addr) == MEM_REF)
1778     {
1779       offset_int bit_offset = 0;
1780       tree off = TREE_OPERAND (base_addr, 1);
1781
1782       if (!integer_zerop (off))
1783         {
1784           offset_int boff, coff = mem_ref_offset (base_addr);
1785           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1786           bit_offset += boff;
1787         }
1788
1789       base_addr = TREE_OPERAND (base_addr, 0);
1790
1791       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1792       if (wi::neg_p (bit_offset))
1793         {
1794           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1795           offset_int tem = bit_offset.and_not (mask);
1796           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1797              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1798           bit_offset -= tem;
1799           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1800           if (offset)
1801             offset = size_binop (PLUS_EXPR, offset,
1802                                     wide_int_to_tree (sizetype, tem));
1803           else
1804             offset = wide_int_to_tree (sizetype, tem);
1805         }
1806
1807       bitpos += bit_offset.to_shwi ();
1808     }
1809
1810   if (bitpos % BITS_PER_UNIT)
1811     return false;
1812   if (bitsize % BITS_PER_UNIT)
1813     return false;
1814
1815   if (!init_symbolic_number (n, ref))
1816     return false;
1817   n->base_addr = base_addr;
1818   n->offset = offset;
1819   n->bytepos = bitpos / BITS_PER_UNIT;
1820   n->alias_set = reference_alias_ptr_type (ref);
1821   n->vuse = gimple_vuse (stmt);
1822   return true;
1823 }
1824
1825 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1826    the operation given by the rhs of STMT on the result.  If the operation
1827    could successfully be executed the function returns a gimple stmt whose
1828    rhs's first tree is the expression of the source operand and NULL
1829    otherwise.  */
1830
1831 static gimple
1832 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1833 {
1834   enum tree_code code;
1835   tree rhs1, rhs2 = NULL;
1836   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1837   enum gimple_rhs_class rhs_class;
1838
1839   if (!limit || !is_gimple_assign (stmt))
1840     return NULL;
1841
1842   rhs1 = gimple_assign_rhs1 (stmt);
1843
1844   if (find_bswap_or_nop_load (stmt, rhs1, n))
1845     return stmt;
1846
1847   if (TREE_CODE (rhs1) != SSA_NAME)
1848     return NULL;
1849
1850   code = gimple_assign_rhs_code (stmt);
1851   rhs_class = gimple_assign_rhs_class (stmt);
1852   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1853
1854   if (rhs_class == GIMPLE_BINARY_RHS)
1855     rhs2 = gimple_assign_rhs2 (stmt);
1856
1857   /* Handle unary rhs and binary rhs with integer constants as second
1858      operand.  */
1859
1860   if (rhs_class == GIMPLE_UNARY_RHS
1861       || (rhs_class == GIMPLE_BINARY_RHS
1862           && TREE_CODE (rhs2) == INTEGER_CST))
1863     {
1864       if (code != BIT_AND_EXPR
1865           && code != LSHIFT_EXPR
1866           && code != RSHIFT_EXPR
1867           && code != LROTATE_EXPR
1868           && code != RROTATE_EXPR
1869           && !CONVERT_EXPR_CODE_P (code))
1870         return NULL;
1871
1872       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1873
1874       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1875          we have to initialize the symbolic number.  */
1876       if (!source_stmt1)
1877         {
1878           if (gimple_assign_load_p (stmt)
1879               || !init_symbolic_number (n, rhs1))
1880             return NULL;
1881           source_stmt1 = stmt;
1882         }
1883
1884       switch (code)
1885         {
1886         case BIT_AND_EXPR:
1887           {
1888             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1889             uint64_t val = int_cst_value (rhs2), mask = 0;
1890             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
1891
1892             /* Only constants masking full bytes are allowed.  */
1893             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
1894               if ((val & tmp) != 0 && (val & tmp) != tmp)
1895                 return NULL;
1896               else if (val & tmp)
1897                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
1898
1899             n->n &= mask;
1900           }
1901           break;
1902         case LSHIFT_EXPR:
1903         case RSHIFT_EXPR:
1904         case LROTATE_EXPR:
1905         case RROTATE_EXPR:
1906           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1907             return NULL;
1908           break;
1909         CASE_CONVERT:
1910           {
1911             int i, type_size, old_type_size;
1912             tree type;
1913
1914             type = gimple_expr_type (stmt);
1915             type_size = TYPE_PRECISION (type);
1916             if (type_size % BITS_PER_UNIT != 0)
1917               return NULL;
1918             type_size /= BITS_PER_UNIT;
1919             if (type_size > 64 / BITS_PER_MARKER)
1920               return NULL;
1921
1922             /* Sign extension: result is dependent on the value.  */
1923             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1924             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
1925                 && HEAD_MARKER (n->n, old_type_size))
1926               for (i = 0; i < type_size - old_type_size; i++)
1927                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1928                         << ((type_size - 1 - i) * BITS_PER_MARKER);
1929
1930             if (type_size < 64 / BITS_PER_MARKER)
1931               {
1932                 /* If STMT casts to a smaller type mask out the bits not
1933                    belonging to the target type.  */
1934                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
1935               }
1936             n->type = type;
1937             if (!n->base_addr)
1938               n->range = type_size;
1939           }
1940           break;
1941         default:
1942           return NULL;
1943         };
1944       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
1945     }
1946
1947   /* Handle binary rhs.  */
1948
1949   if (rhs_class == GIMPLE_BINARY_RHS)
1950     {
1951       int i, size;
1952       struct symbolic_number n1, n2;
1953       uint64_t mask;
1954       gimple source_stmt2;
1955
1956       if (code != BIT_IOR_EXPR)
1957         return NULL;
1958
1959       if (TREE_CODE (rhs2) != SSA_NAME)
1960         return NULL;
1961
1962       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1963
1964       switch (code)
1965         {
1966         case BIT_IOR_EXPR:
1967           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1968
1969           if (!source_stmt1)
1970             return NULL;
1971
1972           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1973
1974           if (!source_stmt2)
1975             return NULL;
1976
1977           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1978             return NULL;
1979
1980           if (!n1.vuse != !n2.vuse ||
1981           (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1982             return NULL;
1983
1984           if (gimple_assign_rhs1 (source_stmt1)
1985               != gimple_assign_rhs1 (source_stmt2))
1986             {
1987               int64_t inc;
1988               HOST_WIDE_INT off_sub;
1989               struct symbolic_number *n_ptr;
1990
1991               if (!n1.base_addr || !n2.base_addr
1992                   || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1993                 return NULL;
1994               if (!n1.offset != !n2.offset ||
1995                   (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1996                 return NULL;
1997
1998               /* We swap n1 with n2 to have n1 < n2.  */
1999               if (n2.bytepos < n1.bytepos)
2000                 {
2001                   struct symbolic_number tmpn;
2002
2003                   tmpn = n2;
2004                   n2 = n1;
2005                   n1 = tmpn;
2006                   source_stmt1 = source_stmt2;
2007                 }
2008
2009               off_sub = n2.bytepos - n1.bytepos;
2010
2011               /* Check that the range of memory covered can be represented by
2012                  a symbolic number.  */
2013               if (off_sub + n2.range > 64 / BITS_PER_MARKER)
2014                 return NULL;
2015               n->range = n2.range + off_sub;
2016
2017               /* Reinterpret byte marks in symbolic number holding the value of
2018                  bigger weight according to target endianness.  */
2019               inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
2020               size = TYPE_PRECISION (n1.type) / BITS_PER_UNIT;
2021               if (BYTES_BIG_ENDIAN)
2022                 n_ptr = &n1;
2023               else
2024                 n_ptr = &n2;
2025               for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2026                 {
2027                   unsigned marker =
2028                     (n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2029                   if (marker && marker != MARKER_BYTE_UNKNOWN)
2030                     n_ptr->n += inc;
2031                 }
2032             }
2033           else
2034             n->range = n1.range;
2035
2036           if (!n1.alias_set
2037               || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
2038             n->alias_set = n1.alias_set;
2039           else
2040             n->alias_set = ptr_type_node;
2041           n->vuse = n1.vuse;
2042           n->base_addr = n1.base_addr;
2043           n->offset = n1.offset;
2044           n->bytepos = n1.bytepos;
2045           n->type = n1.type;
2046           size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2047           for (i = 0, mask = MARKER_MASK; i < size;
2048                i++, mask <<= BITS_PER_MARKER)
2049             {
2050               uint64_t masked1, masked2;
2051
2052               masked1 = n1.n & mask;
2053               masked2 = n2.n & mask;
2054               if (masked1 && masked2 && masked1 != masked2)
2055                 return NULL;
2056             }
2057           n->n = n1.n | n2.n;
2058
2059           if (!verify_symbolic_number_p (n, stmt))
2060             return NULL;
2061
2062           break;
2063         default:
2064           return NULL;
2065         }
2066       return source_stmt1;
2067     }
2068   return NULL;
2069 }
2070
2071 /* Check if STMT completes a bswap implementation or a read in a given
2072    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2073    accordingly.  It also sets N to represent the kind of operations
2074    performed: size of the resulting expression and whether it works on
2075    a memory source, and if so alias-set and vuse.  At last, the
2076    function returns a stmt whose rhs's first tree is the source
2077    expression.  */
2078
2079 static gimple
2080 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2081 {
2082 /* The number which the find_bswap_or_nop_1 result should match in order
2083    to have a full byte swap.  The number is shifted to the right
2084    according to the size of the symbolic number before using it.  */
2085   uint64_t cmpxchg = CMPXCHG;
2086   uint64_t cmpnop = CMPNOP;
2087
2088   gimple source_stmt;
2089   int limit;
2090
2091   /* The last parameter determines the depth search limit.  It usually
2092      correlates directly to the number n of bytes to be touched.  We
2093      increase that number by log2(n) + 1 here in order to also
2094      cover signed -> unsigned conversions of the src operand as can be seen
2095      in libgcc, and for initial shift/and operation of the src operand.  */
2096   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2097   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2098   source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
2099
2100   if (!source_stmt)
2101     return NULL;
2102
2103   /* Find real size of result (highest non zero byte).  */
2104   if (n->base_addr)
2105     {
2106       int rsize;
2107       uint64_t tmpn;
2108
2109       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2110       n->range = rsize;
2111     }
2112
2113   /* Zero out the extra bits of N and CMP*.  */
2114   if (n->range < (int) sizeof (int64_t))
2115     {
2116       uint64_t mask;
2117
2118       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2119       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2120       cmpnop &= mask;
2121     }
2122
2123   /* A complete byte swap should make the symbolic number to start with
2124      the largest digit in the highest order byte. Unchanged symbolic
2125      number indicates a read with same endianness as target architecture.  */
2126   if (n->n == cmpnop)
2127     *bswap = false;
2128   else if (n->n == cmpxchg)
2129     *bswap = true;
2130   else
2131     return NULL;
2132
2133   /* Useless bit manipulation performed by code.  */
2134   if (!n->base_addr && n->n == cmpnop)
2135     return NULL;
2136
2137   n->range *= BITS_PER_UNIT;
2138   return source_stmt;
2139 }
2140
2141 namespace {
2142
/* Static description of the bswap pass: a GIMPLE pass named "bswap" that
   requires SSA form, is not attached to any opt-info group or timevar and
   neither provides nor destroys properties.  */
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  "bswap", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
2155
2156 class pass_optimize_bswap : public gimple_opt_pass
2157 {
2158 public:
2159   pass_optimize_bswap (gcc::context *ctxt)
2160     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2161   {}
2162
2163   /* opt_pass methods: */
2164   virtual bool gate (function *)
2165     {
2166       return flag_expensive_optimizations && optimize;
2167     }
2168
2169   virtual unsigned int execute (function *);
2170
2171 }; // class pass_optimize_bswap
2172
2173 /* Perform the bswap optimization: replace the expression computed in the rhs
2174    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2175    Which of these alternatives replace the rhs is given by N->base_addr (non
2176    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2177    load to perform are also given in N while the builtin bswap invoke is given
2178    in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2179    load statements involved to construct the rhs in CUR_STMT and N->range gives
2180    the size of the rhs expression for maintaining some statistics.
2181
2182    Note that if the replacement involve a load, CUR_STMT is moved just after
2183    SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2184    changing of basic block.  */
2185
2186 static bool
2187 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2188                tree load_type, struct symbolic_number *n, bool bswap)
2189 {
2190   gimple_stmt_iterator gsi;
2191   tree src, tmp, tgt;
2192   gimple bswap_stmt;
2193
2194   gsi = gsi_for_stmt (cur_stmt);
2195   src = gimple_assign_rhs1 (src_stmt);
2196   tgt = gimple_assign_lhs (cur_stmt);
2197
2198   /* Need to load the value from memory first.  */
2199   if (n->base_addr)
2200     {
2201       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2202       tree addr_expr, addr_tmp, val_expr, val_tmp;
2203       tree load_offset_ptr, aligned_load_type;
2204       gimple addr_stmt, load_stmt;
2205       unsigned align;
2206
2207       align = get_object_alignment (src);
2208       if (bswap
2209           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2210           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2211         return false;
2212
2213       /* Move cur_stmt just before  one of the load of the original
2214          to ensure it has the same VUSE.  See PR61517 for what could
2215          go wrong.  */
2216       gsi_move_before (&gsi, &gsi_ins);
2217       gsi = gsi_for_stmt (cur_stmt);
2218
2219       /*  Compute address to load from and cast according to the size
2220           of the load.  */
2221       addr_expr = build_fold_addr_expr (unshare_expr (src));
2222       if (is_gimple_min_invariant (addr_expr))
2223         addr_tmp = addr_expr;
2224       else
2225         {
2226           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2227                                          "load_src");
2228           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2229           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2230         }
2231
2232       /* Perform the load.  */
2233       aligned_load_type = load_type;
2234       if (align < TYPE_ALIGN (load_type))
2235         aligned_load_type = build_aligned_type (load_type, align);
2236       load_offset_ptr = build_int_cst (n->alias_set, 0);
2237       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2238                               load_offset_ptr);
2239
2240       if (!bswap)
2241         {
2242           if (n->range == 16)
2243             nop_stats.found_16bit++;
2244           else if (n->range == 32)
2245             nop_stats.found_32bit++;
2246           else
2247             {
2248               gcc_assert (n->range == 64);
2249               nop_stats.found_64bit++;
2250             }
2251
2252           /* Convert the result of load if necessary.  */
2253           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2254             {
2255               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2256                                             "load_dst");
2257               load_stmt = gimple_build_assign (val_tmp, val_expr);
2258               gimple_set_vuse (load_stmt, n->vuse);
2259               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2260               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2261             }
2262           else
2263             {
2264               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2265               gimple_set_vuse (cur_stmt, n->vuse);
2266             }
2267           update_stmt (cur_stmt);
2268
2269           if (dump_file)
2270             {
2271               fprintf (dump_file,
2272                        "%d bit load in target endianness found at: ",
2273                        (int)n->range);
2274               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2275             }
2276           return true;
2277         }
2278       else
2279         {
2280           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2281           load_stmt = gimple_build_assign (val_tmp, val_expr);
2282           gimple_set_vuse (load_stmt, n->vuse);
2283           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2284         }
2285       src = val_tmp;
2286     }
2287
2288   if (n->range == 16)
2289     bswap_stats.found_16bit++;
2290   else if (n->range == 32)
2291     bswap_stats.found_32bit++;
2292   else
2293     {
2294       gcc_assert (n->range == 64);
2295       bswap_stats.found_64bit++;
2296     }
2297
2298   tmp = src;
2299
2300   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2301      are considered as rotation of 2N bit values by N bits is generally not
2302      equivalent to a bswap.  Consider for instance 0x01020304 >> 16 which gives
2303      0x03040102 while a bswap for that value is 0x04030201.  */
2304   if (bswap && n->range == 16)
2305     {
2306       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2307       bswap_type = TREE_TYPE (src);
2308       src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2309       bswap_stmt = gimple_build_assign (NULL, src);
2310     }
2311   else
2312     {
2313       /* Convert the src expression if necessary.  */
2314       if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2315         {
2316           gimple convert_stmt;
2317           tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2318           convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2319           gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2320         }
2321
2322       bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2323     }
2324
2325   tmp = tgt;
2326
2327   /* Convert the result if necessary.  */
2328   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2329     {
2330       gimple convert_stmt;
2331       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2332       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2333       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2334     }
2335
2336   gimple_set_lhs (bswap_stmt, tmp);
2337
2338   if (dump_file)
2339     {
2340       fprintf (dump_file, "%d bit bswap implementation found at: ",
2341                (int)n->range);
2342       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2343     }
2344
2345   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2346   gsi_remove (&gsi, true);
2347   return true;
2348 }
2349
2350 /* Find manual byte swap implementations as well as load in a given
2351    endianness. Byte swaps are turned into a bswap builtin invokation
2352    while endian loads are converted to bswap builtin invokation or
2353    simple load according to the target endianness.  */
2354
2355 unsigned int
2356 pass_optimize_bswap::execute (function *fun)
2357 {
2358   basic_block bb;
2359   bool bswap32_p, bswap64_p;
2360   bool changed = false;
2361   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2362
2363   if (BITS_PER_UNIT != 8)
2364     return 0;
2365
2366   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2367                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2368   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2369                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2370                    || (bswap32_p && word_mode == SImode)));
2371
2372   /* Determine the argument type of the builtins.  The code later on
2373      assumes that the return and argument type are the same.  */
2374   if (bswap32_p)
2375     {
2376       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2377       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2378     }
2379
2380   if (bswap64_p)
2381     {
2382       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2383       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2384     }
2385
2386   memset (&nop_stats, 0, sizeof (nop_stats));
2387   memset (&bswap_stats, 0, sizeof (bswap_stats));
2388
2389   FOR_EACH_BB_FN (bb, fun)
2390     {
2391       gimple_stmt_iterator gsi;
2392
2393       /* We do a reverse scan for bswap patterns to make sure we get the
2394          widest match. As bswap pattern matching doesn't handle previously
2395          inserted smaller bswap replacements as sub-patterns, the wider
2396          variant wouldn't be detected.  */
2397       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2398         {
2399           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2400           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2401           enum tree_code code;
2402           struct symbolic_number n;
2403           bool bswap;
2404
2405           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2406              might be moved to a different basic block by bswap_replace and gsi
2407              must not points to it if that's the case.  Moving the gsi_prev
2408              there make sure that gsi points to the statement previous to
2409              cur_stmt while still making sure that all statements are
2410              considered in this basic block.  */
2411           gsi_prev (&gsi);
2412
2413           if (!is_gimple_assign (cur_stmt))
2414             continue;
2415
2416           code = gimple_assign_rhs_code (cur_stmt);
2417           switch (code)
2418             {
2419             case LROTATE_EXPR:
2420             case RROTATE_EXPR:
2421               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2422                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2423                      % BITS_PER_UNIT)
2424                 continue;
2425               /* Fall through.  */
2426             case BIT_IOR_EXPR:
2427               break;
2428             default:
2429               continue;
2430             }
2431
2432           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2433
2434           if (!src_stmt)
2435             continue;
2436
2437           switch (n.range)
2438             {
2439             case 16:
2440               /* Already in canonical form, nothing to do.  */
2441               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2442                 continue;
2443               load_type = uint16_type_node;
2444               break;
2445             case 32:
2446               load_type = uint32_type_node;
2447               if (bswap32_p)
2448                 {
2449                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2450                   bswap_type = bswap32_type;
2451                 }
2452               break;
2453             case 64:
2454               load_type = uint64_type_node;
2455               if (bswap64_p)
2456                 {
2457                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2458                   bswap_type = bswap64_type;
2459                 }
2460               break;
2461             default:
2462               continue;
2463             }
2464
2465           if (bswap && !fndecl && n.range != 16)
2466             continue;
2467
2468           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2469                              &n, bswap))
2470             changed = true;
2471         }
2472     }
2473
2474   statistics_counter_event (fun, "16-bit nop implementations found",
2475                             nop_stats.found_16bit);
2476   statistics_counter_event (fun, "32-bit nop implementations found",
2477                             nop_stats.found_32bit);
2478   statistics_counter_event (fun, "64-bit nop implementations found",
2479                             nop_stats.found_64bit);
2480   statistics_counter_event (fun, "16-bit bswap implementations found",
2481                             bswap_stats.found_16bit);
2482   statistics_counter_event (fun, "32-bit bswap implementations found",
2483                             bswap_stats.found_32bit);
2484   statistics_counter_event (fun, "64-bit bswap implementations found",
2485                             bswap_stats.found_64bit);
2486
2487   return (changed ? TODO_update_ssa : 0);
2488 }
2489
2490 } // anon namespace
2491
2492 gimple_opt_pass *
2493 make_pass_optimize_bswap (gcc::context *ctxt)
2494 {
2495   return new pass_optimize_bswap (ctxt);
2496 }
2497
2498 /* Return true if stmt is a type conversion operation that can be stripped
2499    when used in a widening multiply operation.  */
2500 static bool
2501 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2502 {
2503   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2504
2505   if (TREE_CODE (result_type) == INTEGER_TYPE)
2506     {
2507       tree op_type;
2508       tree inner_op_type;
2509
2510       if (!CONVERT_EXPR_CODE_P (rhs_code))
2511         return false;
2512
2513       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2514
2515       /* If the type of OP has the same precision as the result, then
2516          we can strip this conversion.  The multiply operation will be
2517          selected to create the correct extension as a by-product.  */
2518       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2519         return true;
2520
2521       /* We can also strip a conversion if it preserves the signed-ness of
2522          the operation and doesn't narrow the range.  */
2523       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2524
2525       /* If the inner-most type is unsigned, then we can strip any
2526          intermediate widening operation.  If it's signed, then the
2527          intermediate widening operation must also be signed.  */
2528       if ((TYPE_UNSIGNED (inner_op_type)
2529            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2530           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2531         return true;
2532
2533       return false;
2534     }
2535
2536   return rhs_code == FIXED_CONVERT_EXPR;
2537 }
2538
2539 /* Return true if RHS is a suitable operand for a widening multiplication,
2540    assuming a target type of TYPE.
2541    There are two cases:
2542
2543      - RHS makes some value at least twice as wide.  Store that value
2544        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2545
2546      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2547        but leave *TYPE_OUT untouched.  */
2548
2549 static bool
2550 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2551                         tree *new_rhs_out)
2552 {
2553   gimple stmt;
2554   tree type1, rhs1;
2555
2556   if (TREE_CODE (rhs) == SSA_NAME)
2557     {
2558       stmt = SSA_NAME_DEF_STMT (rhs);
2559       if (is_gimple_assign (stmt))
2560         {
2561           if (! widening_mult_conversion_strippable_p (type, stmt))
2562             rhs1 = rhs;
2563           else
2564             {
2565               rhs1 = gimple_assign_rhs1 (stmt);
2566
2567               if (TREE_CODE (rhs1) == INTEGER_CST)
2568                 {
2569                   *new_rhs_out = rhs1;
2570                   *type_out = NULL;
2571                   return true;
2572                 }
2573             }
2574         }
2575       else
2576         rhs1 = rhs;
2577
2578       type1 = TREE_TYPE (rhs1);
2579
2580       if (TREE_CODE (type1) != TREE_CODE (type)
2581           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2582         return false;
2583
2584       *new_rhs_out = rhs1;
2585       *type_out = type1;
2586       return true;
2587     }
2588
2589   if (TREE_CODE (rhs) == INTEGER_CST)
2590     {
2591       *new_rhs_out = rhs;
2592       *type_out = NULL;
2593       return true;
2594     }
2595
2596   return false;
2597 }
2598
2599 /* Return true if STMT performs a widening multiplication, assuming the
2600    output type is TYPE.  If so, store the unwidened types of the operands
2601    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2602    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2603    and *TYPE2_OUT would give the operands of the multiplication.  */
2604
2605 static bool
2606 is_widening_mult_p (gimple stmt,
2607                     tree *type1_out, tree *rhs1_out,
2608                     tree *type2_out, tree *rhs2_out)
2609 {
2610   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2611
2612   if (TREE_CODE (type) != INTEGER_TYPE
2613       && TREE_CODE (type) != FIXED_POINT_TYPE)
2614     return false;
2615
2616   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2617                                rhs1_out))
2618     return false;
2619
2620   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2621                                rhs2_out))
2622     return false;
2623
2624   if (*type1_out == NULL)
2625     {
2626       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2627         return false;
2628       *type1_out = *type2_out;
2629     }
2630
2631   if (*type2_out == NULL)
2632     {
2633       if (!int_fits_type_p (*rhs2_out, *type1_out))
2634         return false;
2635       *type2_out = *type1_out;
2636     }
2637
2638   /* Ensure that the larger of the two operands comes first. */
2639   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2640     {
2641       tree tmp;
2642       tmp = *type1_out;
2643       *type1_out = *type2_out;
2644       *type2_out = tmp;
2645       tmp = *rhs1_out;
2646       *rhs1_out = *rhs2_out;
2647       *rhs2_out = tmp;
2648     }
2649
2650   return true;
2651 }
2652
2653 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2654    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2655    value is true iff we converted the statement.  */
2656
2657 static bool
2658 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2659 {
2660   tree lhs, rhs1, rhs2, type, type1, type2;
2661   enum insn_code handler;
2662   machine_mode to_mode, from_mode, actual_mode;
2663   optab op;
2664   int actual_precision;
2665   location_t loc = gimple_location (stmt);
2666   bool from_unsigned1, from_unsigned2;
2667
2668   lhs = gimple_assign_lhs (stmt);
2669   type = TREE_TYPE (lhs);
2670   if (TREE_CODE (type) != INTEGER_TYPE)
2671     return false;
2672
2673   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2674     return false;
2675
2676   to_mode = TYPE_MODE (type);
2677   from_mode = TYPE_MODE (type1);
2678   from_unsigned1 = TYPE_UNSIGNED (type1);
2679   from_unsigned2 = TYPE_UNSIGNED (type2);
2680
2681   if (from_unsigned1 && from_unsigned2)
2682     op = umul_widen_optab;
2683   else if (!from_unsigned1 && !from_unsigned2)
2684     op = smul_widen_optab;
2685   else
2686     op = usmul_widen_optab;
2687
2688   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2689                                                   0, &actual_mode);
2690
2691   if (handler == CODE_FOR_nothing)
2692     {
2693       if (op != smul_widen_optab)
2694         {
2695           /* We can use a signed multiply with unsigned types as long as
2696              there is a wider mode to use, or it is the smaller of the two
2697              types that is unsigned.  Note that type1 >= type2, always.  */
2698           if ((TYPE_UNSIGNED (type1)
2699                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2700               || (TYPE_UNSIGNED (type2)
2701                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2702             {
2703               from_mode = GET_MODE_WIDER_MODE (from_mode);
2704               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2705                 return false;
2706             }
2707
2708           op = smul_widen_optab;
2709           handler = find_widening_optab_handler_and_mode (op, to_mode,
2710                                                           from_mode, 0,
2711                                                           &actual_mode);
2712
2713           if (handler == CODE_FOR_nothing)
2714             return false;
2715
2716           from_unsigned1 = from_unsigned2 = false;
2717         }
2718       else
2719         return false;
2720     }
2721
2722   /* Ensure that the inputs to the handler are in the correct precison
2723      for the opcode.  This will be the full mode size.  */
2724   actual_precision = GET_MODE_PRECISION (actual_mode);
2725   if (2 * actual_precision > TYPE_PRECISION (type))
2726     return false;
2727   if (actual_precision != TYPE_PRECISION (type1)
2728       || from_unsigned1 != TYPE_UNSIGNED (type1))
2729     rhs1 = build_and_insert_cast (gsi, loc,
2730                                   build_nonstandard_integer_type
2731                                     (actual_precision, from_unsigned1), rhs1);
2732   if (actual_precision != TYPE_PRECISION (type2)
2733       || from_unsigned2 != TYPE_UNSIGNED (type2))
2734     rhs2 = build_and_insert_cast (gsi, loc,
2735                                   build_nonstandard_integer_type
2736                                     (actual_precision, from_unsigned2), rhs2);
2737
2738   /* Handle constants.  */
2739   if (TREE_CODE (rhs1) == INTEGER_CST)
2740     rhs1 = fold_convert (type1, rhs1);
2741   if (TREE_CODE (rhs2) == INTEGER_CST)
2742     rhs2 = fold_convert (type2, rhs2);
2743
2744   gimple_assign_set_rhs1 (stmt, rhs1);
2745   gimple_assign_set_rhs2 (stmt, rhs2);
2746   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2747   update_stmt (stmt);
2748   widen_mul_stats.widen_mults_inserted++;
2749   return true;
2750 }
2751
2752 /* Process a single gimple statement STMT, which is found at the
2753    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2754    rhs (given by CODE), and try to convert it into a
2755    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2756    is true iff we converted the statement.  */
2757
2758 static bool
2759 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2760                             enum tree_code code)
2761 {
2762   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2763   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2764   tree type, type1, type2, optype;
2765   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2766   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2767   optab this_optab;
2768   enum tree_code wmult_code;
2769   enum insn_code handler;
2770   machine_mode to_mode, from_mode, actual_mode;
2771   location_t loc = gimple_location (stmt);
2772   int actual_precision;
2773   bool from_unsigned1, from_unsigned2;
2774
2775   lhs = gimple_assign_lhs (stmt);
2776   type = TREE_TYPE (lhs);
2777   if (TREE_CODE (type) != INTEGER_TYPE
2778       && TREE_CODE (type) != FIXED_POINT_TYPE)
2779     return false;
2780
2781   if (code == MINUS_EXPR)
2782     wmult_code = WIDEN_MULT_MINUS_EXPR;
2783   else
2784     wmult_code = WIDEN_MULT_PLUS_EXPR;
2785
2786   rhs1 = gimple_assign_rhs1 (stmt);
2787   rhs2 = gimple_assign_rhs2 (stmt);
2788
2789   if (TREE_CODE (rhs1) == SSA_NAME)
2790     {
2791       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2792       if (is_gimple_assign (rhs1_stmt))
2793         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2794     }
2795
2796   if (TREE_CODE (rhs2) == SSA_NAME)
2797     {
2798       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2799       if (is_gimple_assign (rhs2_stmt))
2800         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2801     }
2802
2803   /* Allow for one conversion statement between the multiply
2804      and addition/subtraction statement.  If there are more than
2805      one conversions then we assume they would invalidate this
2806      transformation.  If that's not the case then they should have
2807      been folded before now.  */
2808   if (CONVERT_EXPR_CODE_P (rhs1_code))
2809     {
2810       conv1_stmt = rhs1_stmt;
2811       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2812       if (TREE_CODE (rhs1) == SSA_NAME)
2813         {
2814           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2815           if (is_gimple_assign (rhs1_stmt))
2816             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2817         }
2818       else
2819         return false;
2820     }
2821   if (CONVERT_EXPR_CODE_P (rhs2_code))
2822     {
2823       conv2_stmt = rhs2_stmt;
2824       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2825       if (TREE_CODE (rhs2) == SSA_NAME)
2826         {
2827           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2828           if (is_gimple_assign (rhs2_stmt))
2829             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2830         }
2831       else
2832         return false;
2833     }
2834
2835   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2836      is_widening_mult_p, but we still need the rhs returns.
2837
2838      It might also appear that it would be sufficient to use the existing
2839      operands of the widening multiply, but that would limit the choice of
2840      multiply-and-accumulate instructions.
2841
2842      If the widened-multiplication result has more than one uses, it is
2843      probably wiser not to do the conversion.  */
2844   if (code == PLUS_EXPR
2845       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2846     {
2847       if (!has_single_use (rhs1)
2848           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2849                                   &type2, &mult_rhs2))
2850         return false;
2851       add_rhs = rhs2;
2852       conv_stmt = conv1_stmt;
2853     }
2854   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2855     {
2856       if (!has_single_use (rhs2)
2857           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2858                                   &type2, &mult_rhs2))
2859         return false;
2860       add_rhs = rhs1;
2861       conv_stmt = conv2_stmt;
2862     }
2863   else
2864     return false;
2865
2866   to_mode = TYPE_MODE (type);
2867   from_mode = TYPE_MODE (type1);
2868   from_unsigned1 = TYPE_UNSIGNED (type1);
2869   from_unsigned2 = TYPE_UNSIGNED (type2);
2870   optype = type1;
2871
2872   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2873   if (from_unsigned1 != from_unsigned2)
2874     {
2875       if (!INTEGRAL_TYPE_P (type))
2876         return false;
2877       /* We can use a signed multiply with unsigned types as long as
2878          there is a wider mode to use, or it is the smaller of the two
2879          types that is unsigned.  Note that type1 >= type2, always.  */
2880       if ((from_unsigned1
2881            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2882           || (from_unsigned2
2883               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2884         {
2885           from_mode = GET_MODE_WIDER_MODE (from_mode);
2886           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2887             return false;
2888         }
2889
2890       from_unsigned1 = from_unsigned2 = false;
2891       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2892                                                false);
2893     }
2894
2895   /* If there was a conversion between the multiply and addition
2896      then we need to make sure it fits a multiply-and-accumulate.
2897      The should be a single mode change which does not change the
2898      value.  */
2899   if (conv_stmt)
2900     {
2901       /* We use the original, unmodified data types for this.  */
2902       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2903       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2904       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2905       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2906
2907       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2908         {
2909           /* Conversion is a truncate.  */
2910           if (TYPE_PRECISION (to_type) < data_size)
2911             return false;
2912         }
2913       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2914         {
2915           /* Conversion is an extend.  Check it's the right sort.  */
2916           if (TYPE_UNSIGNED (from_type) != is_unsigned
2917               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2918             return false;
2919         }
2920       /* else convert is a no-op for our purposes.  */
2921     }
2922
2923   /* Verify that the machine can perform a widening multiply
2924      accumulate in this mode/signedness combination, otherwise
2925      this transformation is likely to pessimize code.  */
2926   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2927   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2928                                                   from_mode, 0, &actual_mode);
2929
2930   if (handler == CODE_FOR_nothing)
2931     return false;
2932
2933   /* Ensure that the inputs to the handler are in the correct precison
2934      for the opcode.  This will be the full mode size.  */
2935   actual_precision = GET_MODE_PRECISION (actual_mode);
2936   if (actual_precision != TYPE_PRECISION (type1)
2937       || from_unsigned1 != TYPE_UNSIGNED (type1))
2938     mult_rhs1 = build_and_insert_cast (gsi, loc,
2939                                        build_nonstandard_integer_type
2940                                          (actual_precision, from_unsigned1),
2941                                        mult_rhs1);
2942   if (actual_precision != TYPE_PRECISION (type2)
2943       || from_unsigned2 != TYPE_UNSIGNED (type2))
2944     mult_rhs2 = build_and_insert_cast (gsi, loc,
2945                                        build_nonstandard_integer_type
2946                                          (actual_precision, from_unsigned2),
2947                                        mult_rhs2);
2948
2949   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2950     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2951
2952   /* Handle constants.  */
2953   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2954     mult_rhs1 = fold_convert (type1, mult_rhs1);
2955   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2956     mult_rhs2 = fold_convert (type2, mult_rhs2);
2957
2958   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2959                                   add_rhs);
2960   update_stmt (gsi_stmt (*gsi));
2961   widen_mul_stats.maccs_inserted++;
2962   return true;
2963 }
2964
2965 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2966    with uses in additions and subtractions to form fused multiply-add
2967    operations.  Returns true if successful and MUL_STMT should be removed.  */
2968
2969 static bool
2970 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2971 {
2972   tree mul_result = gimple_get_lhs (mul_stmt);
2973   tree type = TREE_TYPE (mul_result);
2974   gimple use_stmt, neguse_stmt;
2975   gassign *fma_stmt;
2976   use_operand_p use_p;
2977   imm_use_iterator imm_iter;
2978
2979   if (FLOAT_TYPE_P (type)
2980       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2981     return false;
2982
2983   /* We don't want to do bitfield reduction ops.  */
2984   if (INTEGRAL_TYPE_P (type)
2985       && (TYPE_PRECISION (type)
2986           != GET_MODE_PRECISION (TYPE_MODE (type))))
2987     return false;
2988
2989   /* If the target doesn't support it, don't generate it.  We assume that
2990      if fma isn't available then fms, fnma or fnms are not either.  */
2991   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2992     return false;
2993
2994   /* If the multiplication has zero uses, it is kept around probably because
2995      of -fnon-call-exceptions.  Don't optimize it away in that case,
2996      it is DCE job.  */
2997   if (has_zero_uses (mul_result))
2998     return false;
2999
3000   /* Make sure that the multiplication statement becomes dead after
3001      the transformation, thus that all uses are transformed to FMAs.
3002      This means we assume that an FMA operation has the same cost
3003      as an addition.  */
3004   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3005     {
3006       enum tree_code use_code;
3007       tree result = mul_result;
3008       bool negate_p = false;
3009
3010       use_stmt = USE_STMT (use_p);
3011
3012       if (is_gimple_debug (use_stmt))
3013         continue;
3014
3015       /* For now restrict this operations to single basic blocks.  In theory
3016          we would want to support sinking the multiplication in
3017          m = a*b;
3018          if ()
3019            ma = m + c;
3020          else
3021            d = m;
3022          to form a fma in the then block and sink the multiplication to the
3023          else block.  */
3024       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3025         return false;
3026
3027       if (!is_gimple_assign (use_stmt))
3028         return false;
3029
3030       use_code = gimple_assign_rhs_code (use_stmt);
3031
3032       /* A negate on the multiplication leads to FNMA.  */
3033       if (use_code == NEGATE_EXPR)
3034         {
3035           ssa_op_iter iter;
3036           use_operand_p usep;
3037
3038           result = gimple_assign_lhs (use_stmt);
3039
3040           /* Make sure the negate statement becomes dead with this
3041              single transformation.  */
3042           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3043                                &use_p, &neguse_stmt))
3044             return false;
3045
3046           /* Make sure the multiplication isn't also used on that stmt.  */
3047           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3048             if (USE_FROM_PTR (usep) == mul_result)
3049               return false;
3050
3051           /* Re-validate.  */
3052           use_stmt = neguse_stmt;
3053           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3054             return false;
3055           if (!is_gimple_assign (use_stmt))
3056             return false;
3057
3058           use_code = gimple_assign_rhs_code (use_stmt);
3059           negate_p = true;
3060         }
3061
3062       switch (use_code)
3063         {
3064         case MINUS_EXPR:
3065           if (gimple_assign_rhs2 (use_stmt) == result)
3066             negate_p = !negate_p;
3067           break;
3068         case PLUS_EXPR:
3069           break;
3070         default:
3071           /* FMA can only be formed from PLUS and MINUS.  */
3072           return false;
3073         }
3074
3075       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3076          by a MULT_EXPR that we'll visit later, we might be able to
3077          get a more profitable match with fnma.
3078          OTOH, if we don't, a negate / fma pair has likely lower latency
3079          that a mult / subtract pair.  */
3080       if (use_code == MINUS_EXPR && !negate_p
3081           && gimple_assign_rhs1 (use_stmt) == result
3082           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3083           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3084         {
3085           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3086
3087           if (TREE_CODE (rhs2) == SSA_NAME)
3088             {
3089               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3090               if (has_single_use (rhs2)
3091                   && is_gimple_assign (stmt2)
3092                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3093               return false;
3094             }
3095         }
3096
3097       /* We can't handle a * b + a * b.  */
3098       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3099         return false;
3100
3101       /* While it is possible to validate whether or not the exact form
3102          that we've recognized is available in the backend, the assumption
3103          is that the transformation is never a loss.  For instance, suppose
3104          the target only has the plain FMA pattern available.  Consider
3105          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3106          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3107          still have 3 operations, but in the FMA form the two NEGs are
3108          independent and could be run in parallel.  */
3109     }
3110
3111   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3112     {
3113       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3114       enum tree_code use_code;
3115       tree addop, mulop1 = op1, result = mul_result;
3116       bool negate_p = false;
3117
3118       if (is_gimple_debug (use_stmt))
3119         continue;
3120
3121       use_code = gimple_assign_rhs_code (use_stmt);
3122       if (use_code == NEGATE_EXPR)
3123         {
3124           result = gimple_assign_lhs (use_stmt);
3125           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3126           gsi_remove (&gsi, true);
3127           release_defs (use_stmt);
3128
3129           use_stmt = neguse_stmt;
3130           gsi = gsi_for_stmt (use_stmt);
3131           use_code = gimple_assign_rhs_code (use_stmt);
3132           negate_p = true;
3133         }
3134
3135       if (gimple_assign_rhs1 (use_stmt) == result)
3136         {
3137           addop = gimple_assign_rhs2 (use_stmt);
3138           /* a * b - c -> a * b + (-c)  */
3139           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3140             addop = force_gimple_operand_gsi (&gsi,
3141                                               build1 (NEGATE_EXPR,
3142                                                       type, addop),
3143                                               true, NULL_TREE, true,
3144                                               GSI_SAME_STMT);
3145         }
3146       else
3147         {
3148           addop = gimple_assign_rhs1 (use_stmt);
3149           /* a - b * c -> (-b) * c + a */
3150           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3151             negate_p = !negate_p;
3152         }
3153
3154       if (negate_p)
3155         mulop1 = force_gimple_operand_gsi (&gsi,
3156                                            build1 (NEGATE_EXPR,
3157                                                    type, mulop1),
3158                                            true, NULL_TREE, true,
3159                                            GSI_SAME_STMT);
3160
3161       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3162                                       FMA_EXPR, mulop1, op2, addop);
3163       gsi_replace (&gsi, fma_stmt, true);
3164       widen_mul_stats.fmas_inserted++;
3165     }
3166
3167   return true;
3168 }
3169
3170 /* Find integer multiplications where the operands are extended from
3171    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3172    where appropriate.  */
3173
3174 namespace {
3175
3176 const pass_data pass_data_optimize_widening_mul =
3177 {
3178   GIMPLE_PASS, /* type */
3179   "widening_mul", /* name */
3180   OPTGROUP_NONE, /* optinfo_flags */
3181   TV_NONE, /* tv_id */
3182   PROP_ssa, /* properties_required */
3183   0, /* properties_provided */
3184   0, /* properties_destroyed */
3185   0, /* todo_flags_start */
3186   TODO_update_ssa, /* todo_flags_finish */
3187 };
3188
3189 class pass_optimize_widening_mul : public gimple_opt_pass
3190 {
3191 public:
3192   pass_optimize_widening_mul (gcc::context *ctxt)
3193     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3194   {}
3195
3196   /* opt_pass methods: */
3197   virtual bool gate (function *)
3198     {
3199       return flag_expensive_optimizations && optimize;
3200     }
3201
3202   virtual unsigned int execute (function *);
3203
3204 }; // class pass_optimize_widening_mul
3205
3206 unsigned int
3207 pass_optimize_widening_mul::execute (function *fun)
3208 {
3209   basic_block bb;
3210   bool cfg_changed = false;
3211
3212   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3213
3214   FOR_EACH_BB_FN (bb, fun)
3215     {
3216       gimple_stmt_iterator gsi;
3217
3218       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3219         {
3220           gimple stmt = gsi_stmt (gsi);
3221           enum tree_code code;
3222
3223           if (is_gimple_assign (stmt))
3224             {
3225               code = gimple_assign_rhs_code (stmt);
3226               switch (code)
3227                 {
3228                 case MULT_EXPR:
3229                   if (!convert_mult_to_widen (stmt, &gsi)
3230                       && convert_mult_to_fma (stmt,
3231                                               gimple_assign_rhs1 (stmt),
3232                                               gimple_assign_rhs2 (stmt)))
3233                     {
3234                       gsi_remove (&gsi, true);
3235                       release_defs (stmt);
3236                       continue;
3237                     }
3238                   break;
3239
3240                 case PLUS_EXPR:
3241                 case MINUS_EXPR:
3242                   convert_plusminus_to_widen (&gsi, stmt, code);
3243                   break;
3244
3245                 default:;
3246                 }
3247             }
3248           else if (is_gimple_call (stmt)
3249                    && gimple_call_lhs (stmt))
3250             {
3251               tree fndecl = gimple_call_fndecl (stmt);
3252               if (fndecl
3253                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3254                 {
3255                   switch (DECL_FUNCTION_CODE (fndecl))
3256                     {
3257                       case BUILT_IN_POWF:
3258                       case BUILT_IN_POW:
3259                       case BUILT_IN_POWL:
3260                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3261                             && REAL_VALUES_EQUAL
3262                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3263                                   dconst2)
3264                             && convert_mult_to_fma (stmt,
3265                                                     gimple_call_arg (stmt, 0),
3266                                                     gimple_call_arg (stmt, 0)))
3267                           {
3268                             unlink_stmt_vdef (stmt);
3269                             if (gsi_remove (&gsi, true)
3270                                 && gimple_purge_dead_eh_edges (bb))
3271                               cfg_changed = true;
3272                             release_defs (stmt);
3273                             continue;
3274                           }
3275                           break;
3276
3277                       default:;
3278                     }
3279                 }
3280             }
3281           gsi_next (&gsi);
3282         }
3283     }
3284
3285   statistics_counter_event (fun, "widening multiplications inserted",
3286                             widen_mul_stats.widen_mults_inserted);
3287   statistics_counter_event (fun, "widening maccs inserted",
3288                             widen_mul_stats.maccs_inserted);
3289   statistics_counter_event (fun, "fused multiply-adds inserted",
3290                             widen_mul_stats.fmas_inserted);
3291
3292   return cfg_changed ? TODO_cleanup_cfg : 0;
3293 }
3294
3295 } // anon namespace
3296
3297 gimple_opt_pass *
3298 make_pass_optimize_widening_mul (gcc::context *ctxt)
3299 {
3300   return new pass_optimize_widening_mul (ctxt);
3301 }