gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    hy the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walk it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "hash-set.h"
  93 #include "machmode.h"
  94 #include "vec.h"
  95 #include "double-int.h"
  96 #include "input.h"
  97 #include "alias.h"
  98 #include "symtab.h"
  99 #include "wide-int.h"
 100 #include "inchash.h"
 101 #include "tree.h"
 102 #include "fold-const.h"
 103 #include "predict.h"
 104 #include "hard-reg-set.h"
 105 #include "function.h"
 106 #include "dominance.h"
 107 #include "cfg.h"
 108 #include "basic-block.h"
 109 #include "tree-ssa-alias.h"
 110 #include "internal-fn.h"
 111 #include "gimple-fold.h"
 112 #include "gimple-expr.h"
 113 #include "is-a.h"
 114 #include "gimple.h"
 115 #include "gimple-iterator.h"
 116 #include "gimplify.h"
 117 #include "gimplify-me.h"
 118 #include "stor-layout.h"
 119 #include "gimple-ssa.h"
 120 #include "tree-cfg.h"
 121 #include "tree-phinodes.h"
 122 #include "ssa-iterators.h"
 123 #include "stringpool.h"
 124 #include "tree-ssanames.h"
 125 #include "hashtab.h"
 126 #include "rtl.h"
 127 #include "statistics.h"
 128 #include "real.h"
 129 #include "fixed-value.h"
 130 #include "insn-config.h"
 131 #include "expmed.h"
 132 #include "dojump.h"
 133 #include "explow.h"
 134 #include "calls.h"
 135 #include "emit-rtl.h"
 136 #include "varasm.h"
 137 #include "stmt.h"
 138 #include "expr.h"
 139 #include "tree-dfa.h"
 140 #include "tree-ssa.h"
 141 #include "tree-pass.h"
 142 #include "alloc-pool.h"
 143 #include "target.h"
 144 #include "gimple-pretty-print.h"
 145 #include "builtins.h"
 146
 147 /* FIXME: RTL headers have to be included here for optabs.  */
 148 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 149 #include "expr.h"               /* Because optabs.h wants sepops.  */
 150 #include "insn-codes.h"
 151 #include "optabs.h"
 152
 153 /* This structure represents one basic block that either computes a
 154    division, or is a common dominator for basic block that compute a
 155    division.  */
 156 struct occurrence {
 157   /* The basic block represented by this structure.  */
 158   basic_block bb;
 159
 160   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 161      inserted in BB.  */
 162   tree recip_def;
 163
 164   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 165      was inserted in BB.  */
 166   gimple recip_def_stmt;
 167
 168   /* Pointer to a list of "struct occurrence"s for blocks dominated
 169      by BB.  */
 170   struct occurrence *children;
 171
 172   /* Pointer to the next "struct occurrence"s in the list of blocks
 173      sharing a common dominator.  */
 174   struct occurrence *next;
 175
 176   /* The number of divisions that are in BB before compute_merit.  The
 177      number of divisions that are in BB or post-dominate it after
 178      compute_merit.  */
 179   int num_divisions;
 180
 181   /* True if the basic block has a division, false if it is a common
 182      dominator for basic blocks that do.  If it is false and trapping
 183      math is active, BB is not a candidate for inserting a reciprocal.  */
 184   bool bb_has_division;
 185 };
 186
 187 static struct
 188 {
 189   /* Number of 1.0/X ops inserted.  */
 190   int rdivs_inserted;
 191
 192   /* Number of 1.0/FUNC ops inserted.  */
 193   int rfuncs_inserted;
 194 } reciprocal_stats;
 195
 196 static struct
 197 {
 198   /* Number of cexpi calls inserted.  */
 199   int inserted;
 200 } sincos_stats;
 201
 202 static struct
 203 {
 204   /* Number of hand-written 16-bit nop / bswaps found.  */
 205   int found_16bit;
 206
 207   /* Number of hand-written 32-bit nop / bswaps found.  */
 208   int found_32bit;
 209
 210   /* Number of hand-written 64-bit nop / bswaps found.  */
 211   int found_64bit;
 212 } nop_stats, bswap_stats;
 213
 214 static struct
 215 {
 216   /* Number of widening multiplication ops inserted.  */
 217   int widen_mults_inserted;
 218
 219   /* Number of integer multiply-and-accumulate ops inserted.  */
 220   int maccs_inserted;
 221
 222   /* Number of fp fused multiply-add ops inserted.  */
 223   int fmas_inserted;
 224 } widen_mul_stats;
 225
 226 /* The instance of "struct occurrence" representing the highest
 227    interesting block in the dominator tree.  */
 228 static struct occurrence *occ_head;
 229
 230 /* Allocation pool for getting instances of "struct occurrence".  */
 231 static alloc_pool occ_pool;
 232
 233
 234
 235 /* Allocate and return a new struct occurrence for basic block BB, and
 236    whose children list is headed by CHILDREN.  */
 237 static struct occurrence *
 238 occ_new (basic_block bb, struct occurrence *children)
 239 {
 240   struct occurrence *occ;
 241
 242   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 243   memset (occ, 0, sizeof (struct occurrence));
 244
 245   occ->bb = bb;
 246   occ->children = children;
 247   return occ;
 248 }
 249
 250
 251 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 252    list of "struct occurrence"s, one per basic block, having IDOM as
 253    their common dominator.
 254
 255    We try to insert NEW_OCC as deep as possible in the tree, and we also
 256    insert any other block that is a common dominator for BB and one
 257    block already in the tree.  */
 258
 259 static void
 260 insert_bb (struct occurrence *new_occ, basic_block idom,
 261            struct occurrence **p_head)
 262 {
 263   struct occurrence *occ, **p_occ;
 264
 265   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 266     {
 267       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 268       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 269       if (dom == bb)
 270         {
 271           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 272              from its list.  */
 273           *p_occ = occ->next;
 274           occ->next = new_occ->children;
 275           new_occ->children = occ;
 276
 277           /* Try the next block (it may as well be dominated by BB).  */
 278         }
 279
 280       else if (dom == occ_bb)
 281         {
 282           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 283           insert_bb (new_occ, dom, &occ->children);
 284           return;
 285         }
 286
 287       else if (dom != idom)
 288         {
 289           gcc_assert (!dom->aux);
 290
 291           /* There is a dominator between IDOM and BB, add it and make
 292              two children out of NEW_OCC and OCC.  First, remove OCC from
 293              its list.  */
 294           *p_occ = occ->next;
 295           new_occ->next = occ;
 296           occ->next = NULL;
 297
 298           /* None of the previous blocks has DOM as a dominator: if we tail
 299              recursed, we would reexamine them uselessly. Just switch BB with
 300              DOM, and go on looking for blocks dominated by DOM.  */
 301           new_occ = occ_new (dom, new_occ);
 302         }
 303
 304       else
 305         {
 306           /* Nothing special, go on with the next element.  */
 307           p_occ = &occ->next;
 308         }
 309     }
 310
 311   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 312   new_occ->next = *p_head;
 313   *p_head = new_occ;
 314 }
 315
 316 /* Register that we found a division in BB.  */
 317
 318 static inline void
 319 register_division_in (basic_block bb)
 320 {
 321   struct occurrence *occ;
 322
 323   occ = (struct occurrence *) bb->aux;
 324   if (!occ)
 325     {
 326       occ = occ_new (bb, NULL);
 327       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 328     }
 329
 330   occ->bb_has_division = true;
 331   occ->num_divisions++;
 332 }
 333
 334
 335 /* Compute the number of divisions that postdominate each block in OCC and
 336    its children.  */
 337
 338 static void
 339 compute_merit (struct occurrence *occ)
 340 {
 341   struct occurrence *occ_child;
 342   basic_block dom = occ->bb;
 343
 344   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 345     {
 346       basic_block bb;
 347       if (occ_child->children)
 348         compute_merit (occ_child);
 349
 350       if (flag_exceptions)
 351         bb = single_noncomplex_succ (dom);
 352       else
 353         bb = dom;
 354
 355       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 356         occ->num_divisions += occ_child->num_divisions;
 357     }
 358 }
 359
 360
 361 /* Return whether USE_STMT is a floating-point division by DEF.  */
 362 static inline bool
 363 is_division_by (gimple use_stmt, tree def)
 364 {
 365   return is_gimple_assign (use_stmt)
 366          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 367          && gimple_assign_rhs2 (use_stmt) == def
 368          /* Do not recognize x / x as valid division, as we are getting
 369             confused later by replacing all immediate uses x in such
 370             a stmt.  */
 371          && gimple_assign_rhs1 (use_stmt) != def;
 372 }
 373
 374 /* Walk the subset of the dominator tree rooted at OCC, setting the
 375    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 376    the given basic block.  The field may be left NULL, of course,
 377    if it is not possible or profitable to do the optimization.
 378
 379    DEF_BSI is an iterator pointing at the statement defining DEF.
 380    If RECIP_DEF is set, a dominator already has a computation that can
 381    be used.  */
 382
 383 static void
 384 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 385                     tree def, tree recip_def, int threshold)
 386 {
 387   tree type;
 388   gassign *new_stmt;
 389   gimple_stmt_iterator gsi;
 390   struct occurrence *occ_child;
 391
 392   if (!recip_def
 393       && (occ->bb_has_division || !flag_trapping_math)
 394       && occ->num_divisions >= threshold)
 395     {
 396       /* Make a variable with the replacement and substitute it.  */
 397       type = TREE_TYPE (def);
 398       recip_def = create_tmp_reg (type, "reciptmp");
 399       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 400                                       build_one_cst (type), def);
 401
 402       if (occ->bb_has_division)
 403         {
 404           /* Case 1: insert before an existing division.  */
 405           gsi = gsi_after_labels (occ->bb);
 406           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 407             gsi_next (&gsi);
 408
 409           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 410         }
 411       else if (def_gsi && occ->bb == def_gsi->bb)
 412         {
 413           /* Case 2: insert right after the definition.  Note that this will
 414              never happen if the definition statement can throw, because in
 415              that case the sole successor of the statement's basic block will
 416              dominate all the uses as well.  */
 417           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 418         }
 419       else
 420         {
 421           /* Case 3: insert in a basic block not containing defs/uses.  */
 422           gsi = gsi_after_labels (occ->bb);
 423           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 424         }
 425
 426       reciprocal_stats.rdivs_inserted++;
 427
 428       occ->recip_def_stmt = new_stmt;
 429     }
 430
 431   occ->recip_def = recip_def;
 432   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 433     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 434 }
 435
 436
 437 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 438    possible.  */
 439
 440 static inline void
 441 replace_reciprocal (use_operand_p use_p)
 442 {
 443   gimple use_stmt = USE_STMT (use_p);
 444   basic_block bb = gimple_bb (use_stmt);
 445   struct occurrence *occ = (struct occurrence *) bb->aux;
 446
 447   if (optimize_bb_for_speed_p (bb)
 448       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 449     {
 450       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 451       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 452       SET_USE (use_p, occ->recip_def);
 453       fold_stmt_inplace (&gsi);
 454       update_stmt (use_stmt);
 455     }
 456 }
 457
 458
 459 /* Free OCC and return one more "struct occurrence" to be freed.  */
 460
 461 static struct occurrence *
 462 free_bb (struct occurrence *occ)
 463 {
 464   struct occurrence *child, *next;
 465
 466   /* First get the two pointers hanging off OCC.  */
 467   next = occ->next;
 468   child = occ->children;
 469   occ->bb->aux = NULL;
 470   pool_free (occ_pool, occ);
 471
 472   /* Now ensure that we don't recurse unless it is necessary.  */
 473   if (!child)
 474     return next;
 475   else
 476     {
 477       while (next)
 478         next = free_bb (next);
 479
 480       return child;
 481     }
 482 }
 483
 484
 485 /* Look for floating-point divisions among DEF's uses, and try to
 486    replace them by multiplications with the reciprocal.  Add
 487    as many statements computing the reciprocal as needed.
 488
 489    DEF must be a GIMPLE register of a floating-point type.  */
 490
 491 static void
 492 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 493 {
 494   use_operand_p use_p;
 495   imm_use_iterator use_iter;
 496   struct occurrence *occ;
 497   int count = 0, threshold;
 498
 499   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 500
 501   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 502     {
 503       gimple use_stmt = USE_STMT (use_p);
 504       if (is_division_by (use_stmt, def))
 505         {
 506           register_division_in (gimple_bb (use_stmt));
 507           count++;
 508         }
 509     }
 510
 511   /* Do the expensive part only if we can hope to optimize something.  */
 512   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 513   if (count >= threshold)
 514     {
 515       gimple use_stmt;
 516       for (occ = occ_head; occ; occ = occ->next)
 517         {
 518           compute_merit (occ);
 519           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 520         }
 521
 522       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 523         {
 524           if (is_division_by (use_stmt, def))
 525             {
 526               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 527                 replace_reciprocal (use_p);
 528             }
 529         }
 530     }
 531
 532   for (occ = occ_head; occ; )
 533     occ = free_bb (occ);
 534
 535   occ_head = NULL;
 536 }
 537
 538 /* Go through all the floating-point SSA_NAMEs, and call
 539    execute_cse_reciprocals_1 on each of them.  */
 540 namespace {
 541
 542 const pass_data pass_data_cse_reciprocals =
 543 {
 544   GIMPLE_PASS, /* type */
 545   "recip", /* name */
 546   OPTGROUP_NONE, /* optinfo_flags */
 547   TV_NONE, /* tv_id */
 548   PROP_ssa, /* properties_required */
 549   0, /* properties_provided */
 550   0, /* properties_destroyed */
 551   0, /* todo_flags_start */
 552   TODO_update_ssa, /* todo_flags_finish */
 553 };
 554
 555 class pass_cse_reciprocals : public gimple_opt_pass
 556 {
 557 public:
 558   pass_cse_reciprocals (gcc::context *ctxt)
 559     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 560   {}
 561
 562   /* opt_pass methods: */
 563   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 564   virtual unsigned int execute (function *);
 565
 566 }; // class pass_cse_reciprocals
 567
 568 unsigned int
 569 pass_cse_reciprocals::execute (function *fun)
 570 {
 571   basic_block bb;
 572   tree arg;
 573
 574   occ_pool = create_alloc_pool ("dominators for recip",
 575                                 sizeof (struct occurrence),
 576                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 577
 578   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 579   calculate_dominance_info (CDI_DOMINATORS);
 580   calculate_dominance_info (CDI_POST_DOMINATORS);
 581
 582 #ifdef ENABLE_CHECKING
 583   FOR_EACH_BB_FN (bb, fun)
 584     gcc_assert (!bb->aux);
 585 #endif
 586
 587   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 588     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 589         && is_gimple_reg (arg))
 590       {
 591         tree name = ssa_default_def (fun, arg);
 592         if (name)
 593           execute_cse_reciprocals_1 (NULL, name);
 594       }
 595
 596   FOR_EACH_BB_FN (bb, fun)
 597     {
 598       tree def;
 599
 600       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 601            gsi_next (&gsi))
 602         {
 603           gphi *phi = gsi.phi ();
 604           def = PHI_RESULT (phi);
 605           if (! virtual_operand_p (def)
 606               && FLOAT_TYPE_P (TREE_TYPE (def)))
 607             execute_cse_reciprocals_1 (NULL, def);
 608         }
 609
 610       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 611            gsi_next (&gsi))
 612         {
 613           gimple stmt = gsi_stmt (gsi);
 614
 615           if (gimple_has_lhs (stmt)
 616               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 617               && FLOAT_TYPE_P (TREE_TYPE (def))
 618               && TREE_CODE (def) == SSA_NAME)
 619             execute_cse_reciprocals_1 (&gsi, def);
 620         }
 621
 622       if (optimize_bb_for_size_p (bb))
 623         continue;
 624
 625       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 626       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 627            gsi_next (&gsi))
 628         {
 629           gimple stmt = gsi_stmt (gsi);
 630           tree fndecl;
 631
 632           if (is_gimple_assign (stmt)
 633               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 634             {
 635               tree arg1 = gimple_assign_rhs2 (stmt);
 636               gimple stmt1;
 637
 638               if (TREE_CODE (arg1) != SSA_NAME)
 639                 continue;
 640
 641               stmt1 = SSA_NAME_DEF_STMT (arg1);
 642
 643               if (is_gimple_call (stmt1)
 644                   && gimple_call_lhs (stmt1)
 645                   && (fndecl = gimple_call_fndecl (stmt1))
 646                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 647                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 648                 {
 649                   enum built_in_function code;
 650                   bool md_code, fail;
 651                   imm_use_iterator ui;
 652                   use_operand_p use_p;
 653
 654                   code = DECL_FUNCTION_CODE (fndecl);
 655                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 656
 657                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 658                   if (!fndecl)
 659                     continue;
 660
 661                   /* Check that all uses of the SSA name are divisions,
 662                      otherwise replacing the defining statement will do
 663                      the wrong thing.  */
 664                   fail = false;
 665                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 666                     {
 667                       gimple stmt2 = USE_STMT (use_p);
 668                       if (is_gimple_debug (stmt2))
 669                         continue;
 670                       if (!is_gimple_assign (stmt2)
 671                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 672                           || gimple_assign_rhs1 (stmt2) == arg1
 673                           || gimple_assign_rhs2 (stmt2) != arg1)
 674                         {
 675                           fail = true;
 676                           break;
 677                         }
 678                     }
 679                   if (fail)
 680                     continue;
 681
 682                   gimple_replace_ssa_lhs (stmt1, arg1);
 683                   gimple_call_set_fndecl (stmt1, fndecl);
 684                   update_stmt (stmt1);
 685                   reciprocal_stats.rfuncs_inserted++;
 686
 687                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 688                     {
 689                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 690                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 691                       fold_stmt_inplace (&gsi);
 692                       update_stmt (stmt);
 693                     }
 694                 }
 695             }
 696         }
 697     }
 698
 699   statistics_counter_event (fun, "reciprocal divs inserted",
 700                             reciprocal_stats.rdivs_inserted);
 701   statistics_counter_event (fun, "reciprocal functions inserted",
 702                             reciprocal_stats.rfuncs_inserted);
 703
 704   free_dominance_info (CDI_DOMINATORS);
 705   free_dominance_info (CDI_POST_DOMINATORS);
 706   free_alloc_pool (occ_pool);
 707   return 0;
 708 }
 709
 710 } // anon namespace
 711
 712 gimple_opt_pass *
 713 make_pass_cse_reciprocals (gcc::context *ctxt)
 714 {
 715   return new pass_cse_reciprocals (ctxt);
 716 }
 717
 718 /* Records an occurrence at statement USE_STMT in the vector of trees
 719    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 720    is not yet initialized.  Returns true if the occurrence was pushed on
 721    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 722    statements in the vector.  */
 723
 724 static bool
 725 maybe_record_sincos (vec<gimple> *stmts,
 726                      basic_block *top_bb, gimple use_stmt)
 727 {
 728   basic_block use_bb = gimple_bb (use_stmt);
 729   if (*top_bb
 730       && (*top_bb == use_bb
 731           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 732     stmts->safe_push (use_stmt);
 733   else if (!*top_bb
 734            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 735     {
 736       stmts->safe_push (use_stmt);
 737       *top_bb = use_bb;
 738     }
 739   else
 740     return false;
 741
 742   return true;
 743 }
 744
 745 /* Look for sin, cos and cexpi calls with the same argument NAME and
 746    create a single call to cexpi CSEing the result in this case.
 747    We first walk over all immediate uses of the argument collecting
 748    statements that we can CSE in a vector and in a second pass replace
 749    the statement rhs with a REALPART or IMAGPART expression on the
 750    result of the cexpi call we insert before the use statement that
 751    dominates all other candidates.  */
 752
 753 static bool
 754 execute_cse_sincos_1 (tree name)
 755 {
 756   gimple_stmt_iterator gsi;
 757   imm_use_iterator use_iter;
 758   tree fndecl, res, type;
 759   gimple def_stmt, use_stmt, stmt;
 760   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 761   auto_vec<gimple> stmts;
 762   basic_block top_bb = NULL;
 763   int i;
 764   bool cfg_changed = false;
 765
 766   type = TREE_TYPE (name);
 767   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 768     {
 769       if (gimple_code (use_stmt) != GIMPLE_CALL
 770           || !gimple_call_lhs (use_stmt)
 771           || !(fndecl = gimple_call_fndecl (use_stmt))
 772           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 773         continue;
 774
 775       switch (DECL_FUNCTION_CODE (fndecl))
 776         {
 777         CASE_FLT_FN (BUILT_IN_COS):
 778           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 779           break;
 780
 781         CASE_FLT_FN (BUILT_IN_SIN):
 782           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 783           break;
 784
 785         CASE_FLT_FN (BUILT_IN_CEXPI):
 786           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 787           break;
 788
 789         default:;
 790         }
 791     }
 792
 793   if (seen_cos + seen_sin + seen_cexpi <= 1)
 794     return false;
 795
 796   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 797      the name def statement.  */
 798   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 799   if (!fndecl)
 800     return false;
 801   stmt = gimple_build_call (fndecl, 1, name);
 802   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 803   gimple_call_set_lhs (stmt, res);
 804
 805   def_stmt = SSA_NAME_DEF_STMT (name);
 806   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 807       && gimple_code (def_stmt) != GIMPLE_PHI
 808       && gimple_bb (def_stmt) == top_bb)
 809     {
 810       gsi = gsi_for_stmt (def_stmt);
 811       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 812     }
 813   else
 814     {
 815       gsi = gsi_after_labels (top_bb);
 816       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 817     }
 818   sincos_stats.inserted++;
 819
 820   /* And adjust the recorded old call sites.  */
 821   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 822     {
 823       tree rhs = NULL;
 824       fndecl = gimple_call_fndecl (use_stmt);
 825
 826       switch (DECL_FUNCTION_CODE (fndecl))
 827         {
 828         CASE_FLT_FN (BUILT_IN_COS):
 829           rhs = fold_build1 (REALPART_EXPR, type, res);
 830           break;
 831
 832         CASE_FLT_FN (BUILT_IN_SIN):
 833           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 834           break;
 835
 836         CASE_FLT_FN (BUILT_IN_CEXPI):
 837           rhs = res;
 838           break;
 839
 840         default:;
 841           gcc_unreachable ();
 842         }
 843
 844         /* Replace call with a copy.  */
 845         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 846
 847         gsi = gsi_for_stmt (use_stmt);
 848         gsi_replace (&gsi, stmt, true);
 849         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 850           cfg_changed = true;
 851     }
 852
 853   return cfg_changed;
 854 }
 855
 856 /* To evaluate powi(x,n), the floating point value x raised to the
 857    constant integer exponent n, we use a hybrid algorithm that
 858    combines the "window method" with look-up tables.  For an
 859    introduction to exponentiation algorithms and "addition chains",
 860    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 861    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 862    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 863    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 864
 865 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 866    multiplications to inline before calling the system library's pow
 867    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 868    so this default never requires calling pow, powf or powl.  */
 869
 870 #ifndef POWI_MAX_MULTS
 871 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 872 #endif
 873
 874 /* The size of the "optimal power tree" lookup table.  All
 875    exponents less than this value are simply looked up in the
 876    powi_table below.  This threshold is also used to size the
 877    cache of pseudo registers that hold intermediate results.  */
 878 #define POWI_TABLE_SIZE 256
 879
 880 /* The size, in bits of the window, used in the "window method"
 881    exponentiation algorithm.  This is equivalent to a radix of
 882    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 883 #define POWI_WINDOW_SIZE 3
 884
 885 /* The following table is an efficient representation of an
 886    "optimal power tree".  For each value, i, the corresponding
 887    value, j, in the table states than an optimal evaluation
 888    sequence for calculating pow(x,i) can be found by evaluating
 889    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 890    100 integers is given in Knuth's "Seminumerical algorithms".  */
 891
 892 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 893   {
 894       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 895       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 896       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 897      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 898      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 899      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 900      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 901      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 902      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 903      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 904      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 905      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 906      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 907      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 908      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 909      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 910      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 911      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 912      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 913      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 914      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 915      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 916      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 917      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 918      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 919     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 920     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 921     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 922     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 923     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 924     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 925     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 926   };
 927
 928
 929 /* Return the number of multiplications required to calculate
 930    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 931    subroutine of powi_cost.  CACHE is an array indicating
 932    which exponents have already been calculated.  */
 933
 934 static int
 935 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 936 {
 937   /* If we've already calculated this exponent, then this evaluation
 938      doesn't require any additional multiplications.  */
 939   if (cache[n])
 940     return 0;
 941
 942   cache[n] = true;
 943   return powi_lookup_cost (n - powi_table[n], cache)
 944          + powi_lookup_cost (powi_table[n], cache) + 1;
 945 }
 946
 947 /* Return the number of multiplications required to calculate
 948    powi(x,n) for an arbitrary x, given the exponent N.  This
 949    function needs to be kept in sync with powi_as_mults below.  */
 950
 951 static int
 952 powi_cost (HOST_WIDE_INT n)
 953 {
 954   bool cache[POWI_TABLE_SIZE];
 955   unsigned HOST_WIDE_INT digit;
 956   unsigned HOST_WIDE_INT val;
 957   int result;
 958
 959   if (n == 0)
 960     return 0;
 961
 962   /* Ignore the reciprocal when calculating the cost.  */
 963   val = (n < 0) ? -n : n;
 964
 965   /* Initialize the exponent cache.  */
 966   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 967   cache[1] = true;
 968
 969   result = 0;
 970
 971   while (val >= POWI_TABLE_SIZE)
 972     {
 973       if (val & 1)
 974         {
 975           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 976           result += powi_lookup_cost (digit, cache)
 977                     + POWI_WINDOW_SIZE + 1;
 978           val >>= POWI_WINDOW_SIZE;
 979         }
 980       else
 981         {
 982           val >>= 1;
 983           result++;
 984         }
 985     }
 986
 987   return result + powi_lookup_cost (val, cache);
 988 }
 989
 990 /* Recursive subroutine of powi_as_mults.  This function takes the
 991    array, CACHE, of already calculated exponents and an exponent N and
 992    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 993
 994 static tree
 995 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 996                  HOST_WIDE_INT n, tree *cache)
 997 {
 998   tree op0, op1, ssa_target;
 999   unsigned HOST_WIDE_INT digit;
1000   gassign *mult_stmt;
1001
1002   if (n < POWI_TABLE_SIZE && cache[n])
1003     return cache[n];
1004
1005   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1006
1007   if (n < POWI_TABLE_SIZE)
1008     {
1009       cache[n] = ssa_target;
1010       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1011       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1012     }
1013   else if (n & 1)
1014     {
1015       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1016       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1017       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1018     }
1019   else
1020     {
1021       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1022       op1 = op0;
1023     }
1024
1025   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1026   gimple_set_location (mult_stmt, loc);
1027   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1028
1029   return ssa_target;
1030 }
1031
1032 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1033    This function needs to be kept in sync with powi_cost above.  */
1034
1035 static tree
1036 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1037                tree arg0, HOST_WIDE_INT n)
1038 {
1039   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1040   gassign *div_stmt;
1041   tree target;
1042
1043   if (n == 0)
1044     return build_real (type, dconst1);
1045
1046   memset (cache, 0,  sizeof (cache));
1047   cache[1] = arg0;
1048
1049   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1050   if (n >= 0)
1051     return result;
1052
1053   /* If the original exponent was negative, reciprocate the result.  */
1054   target = make_temp_ssa_name (type, NULL, "powmult");
1055   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1056                                   build_real (type, dconst1), result);
1057   gimple_set_location (div_stmt, loc);
1058   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1059
1060   return target;
1061 }
1062
1063 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1064    location info LOC.  If the arguments are appropriate, create an
1065    equivalent sequence of statements prior to GSI using an optimal
1066    number of multiplications, and return an expession holding the
1067    result.  */
1068
1069 static tree
1070 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1071                             tree arg0, HOST_WIDE_INT n)
1072 {
1073   /* Avoid largest negative number.  */
1074   if (n != -n
1075       && ((n >= -1 && n <= 2)
1076           || (optimize_function_for_speed_p (cfun)
1077               && powi_cost (n) <= POWI_MAX_MULTS)))
1078     return powi_as_mults (gsi, loc, arg0, n);
1079
1080   return NULL_TREE;
1081 }
1082
1083 /* Build a gimple call statement that calls FN with argument ARG.
1084    Set the lhs of the call statement to a fresh SSA name.  Insert the
1085    statement prior to GSI's current position, and return the fresh
1086    SSA name.  */
1087
1088 static tree
1089 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1090                        tree fn, tree arg)
1091 {
1092   gcall *call_stmt;
1093   tree ssa_target;
1094
1095   call_stmt = gimple_build_call (fn, 1, arg);
1096   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1097   gimple_set_lhs (call_stmt, ssa_target);
1098   gimple_set_location (call_stmt, loc);
1099   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1100
1101   return ssa_target;
1102 }
1103
1104 /* Build a gimple binary operation with the given CODE and arguments
1105    ARG0, ARG1, assigning the result to a new SSA name for variable
1106    TARGET.  Insert the statement prior to GSI's current position, and
1107    return the fresh SSA name.*/
1108
1109 static tree
1110 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1111                         const char *name, enum tree_code code,
1112                         tree arg0, tree arg1)
1113 {
1114   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1115   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1116   gimple_set_location (stmt, loc);
1117   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1118   return result;
1119 }
1120
1121 /* Build a gimple reference operation with the given CODE and argument
1122    ARG, assigning the result to a new SSA name of TYPE with NAME.
1123    Insert the statement prior to GSI's current position, and return
1124    the fresh SSA name.  */
1125
1126 static inline tree
1127 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1128                       const char *name, enum tree_code code, tree arg0)
1129 {
1130   tree result = make_temp_ssa_name (type, NULL, name);
1131   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1132   gimple_set_location (stmt, loc);
1133   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1134   return result;
1135 }
1136
1137 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1138    prior to GSI's current position, and return the fresh SSA name.  */
1139
1140 static tree
1141 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1142                        tree type, tree val)
1143 {
1144   tree result = make_ssa_name (type);
1145   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1146   gimple_set_location (stmt, loc);
1147   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1148   return result;
1149 }
1150
1151 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1152    with location info LOC.  If possible, create an equivalent and
1153    less expensive sequence of statements prior to GSI, and return an
1154    expession holding the result.  */
1155
1156 static tree
1157 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1158                            tree arg0, tree arg1)
1159 {
1160   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1161   REAL_VALUE_TYPE c2, dconst3;
1162   HOST_WIDE_INT n;
1163   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1164   machine_mode mode;
1165   bool hw_sqrt_exists, c_is_int, c2_is_int;
1166
1167   /* If the exponent isn't a constant, there's nothing of interest
1168      to be done.  */
1169   if (TREE_CODE (arg1) != REAL_CST)
1170     return NULL_TREE;
1171
1172   /* If the exponent is equivalent to an integer, expand to an optimal
1173      multiplication sequence when profitable.  */
1174   c = TREE_REAL_CST (arg1);
1175   n = real_to_integer (&c);
1176   real_from_integer (&cint, VOIDmode, n, SIGNED);
1177   c_is_int = real_identical (&c, &cint);
1178
1179   if (c_is_int
1180       && ((n >= -1 && n <= 2)
1181           || (flag_unsafe_math_optimizations
1182               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1183               && powi_cost (n) <= POWI_MAX_MULTS)))
1184     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1185
1186   /* Attempt various optimizations using sqrt and cbrt.  */
1187   type = TREE_TYPE (arg0);
1188   mode = TYPE_MODE (type);
1189   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1190
1191   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1192      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1193      sqrt(-0) = -0.  */
1194   if (sqrtfn
1195       && REAL_VALUES_EQUAL (c, dconsthalf)
1196       && !HONOR_SIGNED_ZEROS (mode))
1197     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1198
1199   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1200      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1201      so do this optimization even if -Os.  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_4 = dconst1;
1204   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1205   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1206
1207   if (flag_unsafe_math_optimizations
1208       && sqrtfn
1209       && REAL_VALUES_EQUAL (c, dconst1_4)
1210       && hw_sqrt_exists)
1211     {
1212       /* sqrt(x)  */
1213       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1214
1215       /* sqrt(sqrt(x))  */
1216       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1217     }
1218
1219   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1220      optimizing for space.  Don't do this optimization if we don't have
1221      a hardware sqrt insn.  */
1222   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1223   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1224
1225   if (flag_unsafe_math_optimizations
1226       && sqrtfn
1227       && optimize_function_for_speed_p (cfun)
1228       && REAL_VALUES_EQUAL (c, dconst3_4)
1229       && hw_sqrt_exists)
1230     {
1231       /* sqrt(x)  */
1232       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1233
1234       /* sqrt(sqrt(x))  */
1235       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1236
1237       /* sqrt(x) * sqrt(sqrt(x))  */
1238       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1239                                      sqrt_arg0, sqrt_sqrt);
1240     }
1241
1242   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1243      optimizations since 1./3. is not exactly representable.  If x
1244      is negative and finite, the correct value of pow(x,1./3.) is
1245      a NaN with the "invalid" exception raised, because the value
1246      of 1./3. actually has an even denominator.  The correct value
1247      of cbrt(x) is a negative real value.  */
1248   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1249   dconst1_3 = real_value_truncate (mode, dconst_third ());
1250
1251   if (flag_unsafe_math_optimizations
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && REAL_VALUES_EQUAL (c, dconst1_3))
1255     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1256
1257   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1258      if we don't have a hardware sqrt insn.  */
1259   dconst1_6 = dconst1_3;
1260   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1261
1262   if (flag_unsafe_math_optimizations
1263       && sqrtfn
1264       && cbrtfn
1265       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1266       && optimize_function_for_speed_p (cfun)
1267       && hw_sqrt_exists
1268       && REAL_VALUES_EQUAL (c, dconst1_6))
1269     {
1270       /* sqrt(x)  */
1271       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1272
1273       /* cbrt(sqrt(x))  */
1274       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1275     }
1276
1277   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1278      and c not an integer, into
1279
1280        sqrt(x) * powi(x, n/2),                n > 0;
1281        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1282
1283      Do not calculate the powi factor when n/2 = 0.  */
1284   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1285   n = real_to_integer (&c2);
1286   real_from_integer (&cint, VOIDmode, n, SIGNED);
1287   c2_is_int = real_identical (&c2, &cint);
1288
1289   if (flag_unsafe_math_optimizations
1290       && sqrtfn
1291       && c2_is_int
1292       && !c_is_int
1293       && optimize_function_for_speed_p (cfun))
1294     {
1295       tree powi_x_ndiv2 = NULL_TREE;
1296
1297       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1298          possible or profitable, give up.  Skip the degenerate case when
1299          n is 1 or -1, where the result is always 1.  */
1300       if (absu_hwi (n) != 1)
1301         {
1302           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1303                                                      abs_hwi (n / 2));
1304           if (!powi_x_ndiv2)
1305             return NULL_TREE;
1306         }
1307
1308       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1309          result of the optimal multiply sequence just calculated.  */
1310       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1311
1312       if (absu_hwi (n) == 1)
1313         result = sqrt_arg0;
1314       else
1315         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1316                                          sqrt_arg0, powi_x_ndiv2);
1317
1318       /* If n is negative, reciprocate the result.  */
1319       if (n < 0)
1320         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1321                                          build_real (type, dconst1), result);
1322       return result;
1323     }
1324
1325   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1326
1327      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1328      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1329
1330      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1331      different from pow(x, 1./3.) due to rounding and behavior with
1332      negative x, we need to constrain this transformation to unsafe
1333      math and positive x or finite math.  */
1334   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1335   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1336   real_round (&c2, mode, &c2);
1337   n = real_to_integer (&c2);
1338   real_from_integer (&cint, VOIDmode, n, SIGNED);
1339   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1340   real_convert (&c2, mode, &c2);
1341
1342   if (flag_unsafe_math_optimizations
1343       && cbrtfn
1344       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1345       && real_identical (&c2, &c)
1346       && !c2_is_int
1347       && optimize_function_for_speed_p (cfun)
1348       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1349     {
1350       tree powi_x_ndiv3 = NULL_TREE;
1351
1352       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1353          possible or profitable, give up.  Skip the degenerate case when
1354          abs(n) < 3, where the result is always 1.  */
1355       if (absu_hwi (n) >= 3)
1356         {
1357           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1358                                                      abs_hwi (n / 3));
1359           if (!powi_x_ndiv3)
1360             return NULL_TREE;
1361         }
1362
1363       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1364          as that creates an unnecessary variable.  Instead, just produce
1365          either cbrt(x) or cbrt(x) * cbrt(x).  */
1366       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1367
1368       if (absu_hwi (n) % 3 == 1)
1369         powi_cbrt_x = cbrt_x;
1370       else
1371         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1372                                               cbrt_x, cbrt_x);
1373
1374       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1375       if (absu_hwi (n) < 3)
1376         result = powi_cbrt_x;
1377       else
1378         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1379                                          powi_x_ndiv3, powi_cbrt_x);
1380
1381       /* If n is negative, reciprocate the result.  */
1382       if (n < 0)
1383         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1384                                          build_real (type, dconst1), result);
1385
1386       return result;
1387     }
1388
1389   /* No optimizations succeeded.  */
1390   return NULL_TREE;
1391 }
1392
1393 /* ARG is the argument to a cabs builtin call in GSI with location info
1394    LOC.  Create a sequence of statements prior to GSI that calculates
1395    sqrt(R*R + I*I), where R and I are the real and imaginary components
1396    of ARG, respectively.  Return an expression holding the result.  */
1397
1398 static tree
1399 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1400 {
1401   tree real_part, imag_part, addend1, addend2, sum, result;
1402   tree type = TREE_TYPE (TREE_TYPE (arg));
1403   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1404   machine_mode mode = TYPE_MODE (type);
1405
1406   if (!flag_unsafe_math_optimizations
1407       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1408       || !sqrtfn
1409       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1410     return NULL_TREE;
1411
1412   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1413                                     REALPART_EXPR, arg);
1414   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1415                                     real_part, real_part);
1416   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1417                                     IMAGPART_EXPR, arg);
1418   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1419                                     imag_part, imag_part);
1420   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1421   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1422
1423   return result;
1424 }
1425
1426 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1427    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1428    an optimal number of multiplies, when n is a constant.  */
1429
1430 namespace {
1431
1432 const pass_data pass_data_cse_sincos =
1433 {
1434   GIMPLE_PASS, /* type */
1435   "sincos", /* name */
1436   OPTGROUP_NONE, /* optinfo_flags */
1437   TV_NONE, /* tv_id */
1438   PROP_ssa, /* properties_required */
1439   0, /* properties_provided */
1440   0, /* properties_destroyed */
1441   0, /* todo_flags_start */
1442   TODO_update_ssa, /* todo_flags_finish */
1443 };
1444
1445 class pass_cse_sincos : public gimple_opt_pass
1446 {
1447 public:
1448   pass_cse_sincos (gcc::context *ctxt)
1449     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1450   {}
1451
1452   /* opt_pass methods: */
1453   virtual bool gate (function *)
1454     {
1455       /* We no longer require either sincos or cexp, since powi expansion
1456          piggybacks on this pass.  */
1457       return optimize;
1458     }
1459
1460   virtual unsigned int execute (function *);
1461
1462 }; // class pass_cse_sincos
1463
1464 unsigned int
1465 pass_cse_sincos::execute (function *fun)
1466 {
1467   basic_block bb;
1468   bool cfg_changed = false;
1469
1470   calculate_dominance_info (CDI_DOMINATORS);
1471   memset (&sincos_stats, 0, sizeof (sincos_stats));
1472
1473   FOR_EACH_BB_FN (bb, fun)
1474     {
1475       gimple_stmt_iterator gsi;
1476       bool cleanup_eh = false;
1477
1478       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479         {
1480           gimple stmt = gsi_stmt (gsi);
1481           tree fndecl;
1482
1483           /* Only the last stmt in a bb could throw, no need to call
1484              gimple_purge_dead_eh_edges if we change something in the middle
1485              of a basic block.  */
1486           cleanup_eh = false;
1487
1488           if (is_gimple_call (stmt)
1489               && gimple_call_lhs (stmt)
1490               && (fndecl = gimple_call_fndecl (stmt))
1491               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1492             {
1493               tree arg, arg0, arg1, result;
1494               HOST_WIDE_INT n;
1495               location_t loc;
1496
1497               switch (DECL_FUNCTION_CODE (fndecl))
1498                 {
1499                 CASE_FLT_FN (BUILT_IN_COS):
1500                 CASE_FLT_FN (BUILT_IN_SIN):
1501                 CASE_FLT_FN (BUILT_IN_CEXPI):
1502                   /* Make sure we have either sincos or cexp.  */
1503                   if (!targetm.libc_has_function (function_c99_math_complex)
1504                       && !targetm.libc_has_function (function_sincos))
1505                     break;
1506
1507                   arg = gimple_call_arg (stmt, 0);
1508                   if (TREE_CODE (arg) == SSA_NAME)
1509                     cfg_changed |= execute_cse_sincos_1 (arg);
1510                   break;
1511
1512                 CASE_FLT_FN (BUILT_IN_POW):
1513                   arg0 = gimple_call_arg (stmt, 0);
1514                   arg1 = gimple_call_arg (stmt, 1);
1515
1516                   loc = gimple_location (stmt);
1517                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1518
1519                   if (result)
1520                     {
1521                       tree lhs = gimple_get_lhs (stmt);
1522                       gassign *new_stmt = gimple_build_assign (lhs, result);
1523                       gimple_set_location (new_stmt, loc);
1524                       unlink_stmt_vdef (stmt);
1525                       gsi_replace (&gsi, new_stmt, true);
1526                       cleanup_eh = true;
1527                       if (gimple_vdef (stmt))
1528                         release_ssa_name (gimple_vdef (stmt));
1529                     }
1530                   break;
1531
1532                 CASE_FLT_FN (BUILT_IN_POWI):
1533                   arg0 = gimple_call_arg (stmt, 0);
1534                   arg1 = gimple_call_arg (stmt, 1);
1535                   loc = gimple_location (stmt);
1536
1537                   if (real_minus_onep (arg0))
1538                     {
1539                       tree t0, t1, cond, one, minus_one;
1540                       gassign *stmt;
1541
1542                       t0 = TREE_TYPE (arg0);
1543                       t1 = TREE_TYPE (arg1);
1544                       one = build_real (t0, dconst1);
1545                       minus_one = build_real (t0, dconstm1);
1546
1547                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1548                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1549                                                   arg1, build_int_cst (t1, 1));
1550                       gimple_set_location (stmt, loc);
1551                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1552
1553                       result = make_temp_ssa_name (t0, NULL, "powi");
1554                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1555                                                   minus_one, one);
1556                       gimple_set_location (stmt, loc);
1557                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1558                     }
1559                   else
1560                     {
1561                       if (!tree_fits_shwi_p (arg1))
1562                         break;
1563
1564                       n = tree_to_shwi (arg1);
1565                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1566                     }
1567
1568                   if (result)
1569                     {
1570                       tree lhs = gimple_get_lhs (stmt);
1571                       gassign *new_stmt = gimple_build_assign (lhs, result);
1572                       gimple_set_location (new_stmt, loc);
1573                       unlink_stmt_vdef (stmt);
1574                       gsi_replace (&gsi, new_stmt, true);
1575                       cleanup_eh = true;
1576                       if (gimple_vdef (stmt))
1577                         release_ssa_name (gimple_vdef (stmt));
1578                     }
1579                   break;
1580
1581                 CASE_FLT_FN (BUILT_IN_CABS):
1582                   arg0 = gimple_call_arg (stmt, 0);
1583                   loc = gimple_location (stmt);
1584                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1585
1586                   if (result)
1587                     {
1588                       tree lhs = gimple_get_lhs (stmt);
1589                       gassign *new_stmt = gimple_build_assign (lhs, result);
1590                       gimple_set_location (new_stmt, loc);
1591                       unlink_stmt_vdef (stmt);
1592                       gsi_replace (&gsi, new_stmt, true);
1593                       cleanup_eh = true;
1594                       if (gimple_vdef (stmt))
1595                         release_ssa_name (gimple_vdef (stmt));
1596                     }
1597                   break;
1598
1599                 default:;
1600                 }
1601             }
1602         }
1603       if (cleanup_eh)
1604         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1605     }
1606
1607   statistics_counter_event (fun, "sincos statements inserted",
1608                             sincos_stats.inserted);
1609
1610   free_dominance_info (CDI_DOMINATORS);
1611   return cfg_changed ? TODO_cleanup_cfg : 0;
1612 }
1613
1614 } // anon namespace
1615
1616 gimple_opt_pass *
1617 make_pass_cse_sincos (gcc::context *ctxt)
1618 {
1619   return new pass_cse_sincos (ctxt);
1620 }
1621
1622 /* A symbolic number is used to detect byte permutation and selection
1623    patterns.  Therefore the field N contains an artificial number
1624    consisting of octet sized markers:
1625
1626    0    - target byte has the value 0
1627    FF   - target byte has an unknown value (eg. due to sign extension)
1628    1..size - marker value is the target byte index minus one.
1629
1630    To detect permutations on memory sources (arrays and structures), a symbolic
1631    number is also associated a base address (the array or structure the load is
1632    made from), an offset from the base address and a range which gives the
1633    difference between the highest and lowest accessed memory location to make
1634    such a symbolic number. The range is thus different from size which reflects
1635    the size of the type of current expression. Note that for non memory source,
1636    range holds the same value as size.
1637
1638    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1639    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1640    still have a size of 2 but this time a range of 1.  */
1641
1642 struct symbolic_number {
1643   uint64_t n;
1644   tree type;
1645   tree base_addr;
1646   tree offset;
1647   HOST_WIDE_INT bytepos;
1648   tree alias_set;
1649   tree vuse;
1650   unsigned HOST_WIDE_INT range;
1651 };
1652
1653 #define BITS_PER_MARKER 8
1654 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1655 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1656 #define HEAD_MARKER(n, size) \
1657   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1658
1659 /* The number which the find_bswap_or_nop_1 result should match in
1660    order to have a nop.  The number is masked according to the size of
1661    the symbolic number before using it.  */
1662 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1663   (uint64_t)0x08070605 << 32 | 0x04030201)
1664
1665 /* The number which the find_bswap_or_nop_1 result should match in
1666    order to have a byte swap.  The number is masked according to the
1667    size of the symbolic number before using it.  */
1668 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1669   (uint64_t)0x01020304 << 32 | 0x05060708)
1670
1671 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1672    number N.  Return false if the requested operation is not permitted
1673    on a symbolic number.  */
1674
1675 static inline bool
1676 do_shift_rotate (enum tree_code code,
1677                  struct symbolic_number *n,
1678                  int count)
1679 {
1680   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1681   unsigned head_marker;
1682
1683   if (count % BITS_PER_UNIT != 0)
1684     return false;
1685   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1686
1687   /* Zero out the extra bits of N in order to avoid them being shifted
1688      into the significant bits.  */
1689   if (size < 64 / BITS_PER_MARKER)
1690     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1691
1692   switch (code)
1693     {
1694     case LSHIFT_EXPR:
1695       n->n <<= count;
1696       break;
1697     case RSHIFT_EXPR:
1698       head_marker = HEAD_MARKER (n->n, size);
1699       n->n >>= count;
1700       /* Arithmetic shift of signed type: result is dependent on the value.  */
1701       if (!TYPE_UNSIGNED (n->type) && head_marker)
1702         for (i = 0; i < count / BITS_PER_MARKER; i++)
1703           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1704                   << ((size - 1 - i) * BITS_PER_MARKER);
1705       break;
1706     case LROTATE_EXPR:
1707       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1708       break;
1709     case RROTATE_EXPR:
1710       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1711       break;
1712     default:
1713       return false;
1714     }
1715   /* Zero unused bits for size.  */
1716   if (size < 64 / BITS_PER_MARKER)
1717     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1718   return true;
1719 }
1720
1721 /* Perform sanity checking for the symbolic number N and the gimple
1722    statement STMT.  */
1723
1724 static inline bool
1725 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1726 {
1727   tree lhs_type;
1728
1729   lhs_type = gimple_expr_type (stmt);
1730
1731   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1732     return false;
1733
1734   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1735     return false;
1736
1737   return true;
1738 }
1739
1740 /* Initialize the symbolic number N for the bswap pass from the base element
1741    SRC manipulated by the bitwise OR expression.  */
1742
1743 static bool
1744 init_symbolic_number (struct symbolic_number *n, tree src)
1745 {
1746   int size;
1747
1748   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1749
1750   /* Set up the symbolic number N by setting each byte to a value between 1 and
1751      the byte size of rhs1.  The highest order byte is set to n->size and the
1752      lowest order byte to 1.  */
1753   n->type = TREE_TYPE (src);
1754   size = TYPE_PRECISION (n->type);
1755   if (size % BITS_PER_UNIT != 0)
1756     return false;
1757   size /= BITS_PER_UNIT;
1758   if (size > 64 / BITS_PER_MARKER)
1759     return false;
1760   n->range = size;
1761   n->n = CMPNOP;
1762
1763   if (size < 64 / BITS_PER_MARKER)
1764     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1765
1766   return true;
1767 }
1768
1769 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1770    the answer. If so, REF is that memory source and the base of the memory area
1771    accessed and the offset of the access from that base are recorded in N.  */
1772
1773 bool
1774 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1775 {
1776   /* Leaf node is an array or component ref. Memorize its base and
1777      offset from base to compare to other such leaf node.  */
1778   HOST_WIDE_INT bitsize, bitpos;
1779   machine_mode mode;
1780   int unsignedp, volatilep;
1781   tree offset, base_addr;
1782
1783   /* Not prepared to handle PDP endian.  */
1784   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
1785     return false;
1786
1787   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1788     return false;
1789
1790   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1791                                    &unsignedp, &volatilep, false);
1792
1793   if (TREE_CODE (base_addr) == MEM_REF)
1794     {
1795       offset_int bit_offset = 0;
1796       tree off = TREE_OPERAND (base_addr, 1);
1797
1798       if (!integer_zerop (off))
1799         {
1800           offset_int boff, coff = mem_ref_offset (base_addr);
1801           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1802           bit_offset += boff;
1803         }
1804
1805       base_addr = TREE_OPERAND (base_addr, 0);
1806
1807       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1808       if (wi::neg_p (bit_offset))
1809         {
1810           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1811           offset_int tem = bit_offset.and_not (mask);
1812           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1813              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1814           bit_offset -= tem;
1815           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1816           if (offset)
1817             offset = size_binop (PLUS_EXPR, offset,
1818                                     wide_int_to_tree (sizetype, tem));
1819           else
1820             offset = wide_int_to_tree (sizetype, tem);
1821         }
1822
1823       bitpos += bit_offset.to_shwi ();
1824     }
1825
1826   if (bitpos % BITS_PER_UNIT)
1827     return false;
1828   if (bitsize % BITS_PER_UNIT)
1829     return false;
1830
1831   if (!init_symbolic_number (n, ref))
1832     return false;
1833   n->base_addr = base_addr;
1834   n->offset = offset;
1835   n->bytepos = bitpos / BITS_PER_UNIT;
1836   n->alias_set = reference_alias_ptr_type (ref);
1837   n->vuse = gimple_vuse (stmt);
1838   return true;
1839 }
1840
1841 /* Compute the symbolic number N representing the result of a bitwise OR on 2
1842    symbolic number N1 and N2 whose source statements are respectively
1843    SOURCE_STMT1 and SOURCE_STMT2.  */
1844
1845 static gimple
1846 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1847                         gimple source_stmt2, struct symbolic_number *n2,
1848                         struct symbolic_number *n)
1849 {
1850   int i, size;
1851   uint64_t mask;
1852   gimple source_stmt;
1853   struct symbolic_number *n_start;
1854
1855   /* Sources are different, cancel bswap if they are not memory location with
1856      the same base (array, structure, ...).  */
1857   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1858     {
1859       int64_t inc;
1860       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1861       struct symbolic_number *toinc_n_ptr, *n_end;
1862
1863       if (!n1->base_addr || !n2->base_addr
1864           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1865         return NULL;
1866
1867       if (!n1->offset != !n2->offset
1868           || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1869         return NULL;
1870
1871       if (n1->bytepos < n2->bytepos)
1872         {
1873           n_start = n1;
1874           start_sub = n2->bytepos - n1->bytepos;
1875           source_stmt = source_stmt1;
1876         }
1877       else
1878         {
1879           n_start = n2;
1880           start_sub = n1->bytepos - n2->bytepos;
1881           source_stmt = source_stmt2;
1882         }
1883
1884       /* Find the highest address at which a load is performed and
1885          compute related info.  */
1886       end1 = n1->bytepos + (n1->range - 1);
1887       end2 = n2->bytepos + (n2->range - 1);
1888       if (end1 < end2)
1889         {
1890           end = end2;
1891           end_sub = end2 - end1;
1892         }
1893       else
1894         {
1895           end = end1;
1896           end_sub = end1 - end2;
1897         }
1898       n_end = (end2 > end1) ? n2 : n1;
1899
1900       /* Find symbolic number whose lsb is the most significant.  */
1901       if (BYTES_BIG_ENDIAN)
1902         toinc_n_ptr = (n_end == n1) ? n2 : n1;
1903       else
1904         toinc_n_ptr = (n_start == n1) ? n2 : n1;
1905
1906       n->range = end - n_start->bytepos + 1;
1907
1908       /* Check that the range of memory covered can be represented by
1909          a symbolic number.  */
1910       if (n->range > 64 / BITS_PER_MARKER)
1911         return NULL;
1912
1913       /* Reinterpret byte marks in symbolic number holding the value of
1914          bigger weight according to target endianness.  */
1915       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1916       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1917       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1918         {
1919           unsigned marker
1920             = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1921           if (marker && marker != MARKER_BYTE_UNKNOWN)
1922             toinc_n_ptr->n += inc;
1923         }
1924     }
1925   else
1926     {
1927       n->range = n1->range;
1928       n_start = n1;
1929       source_stmt = source_stmt1;
1930     }
1931
1932   if (!n1->alias_set
1933       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1934     n->alias_set = n1->alias_set;
1935   else
1936     n->alias_set = ptr_type_node;
1937   n->vuse = n_start->vuse;
1938   n->base_addr = n_start->base_addr;
1939   n->offset = n_start->offset;
1940   n->bytepos = n_start->bytepos;
1941   n->type = n_start->type;
1942   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1943
1944   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1945     {
1946       uint64_t masked1, masked2;
1947
1948       masked1 = n1->n & mask;
1949       masked2 = n2->n & mask;
1950       if (masked1 && masked2 && masked1 != masked2)
1951         return NULL;
1952     }
1953   n->n = n1->n | n2->n;
1954
1955   return source_stmt;
1956 }
1957
1958 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1959    the operation given by the rhs of STMT on the result.  If the operation
1960    could successfully be executed the function returns a gimple stmt whose
1961    rhs's first tree is the expression of the source operand and NULL
1962    otherwise.  */
1963
1964 static gimple
1965 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1966 {
1967   enum tree_code code;
1968   tree rhs1, rhs2 = NULL;
1969   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1970   enum gimple_rhs_class rhs_class;
1971
1972   if (!limit || !is_gimple_assign (stmt))
1973     return NULL;
1974
1975   rhs1 = gimple_assign_rhs1 (stmt);
1976
1977   if (find_bswap_or_nop_load (stmt, rhs1, n))
1978     return stmt;
1979
1980   if (TREE_CODE (rhs1) != SSA_NAME)
1981     return NULL;
1982
1983   code = gimple_assign_rhs_code (stmt);
1984   rhs_class = gimple_assign_rhs_class (stmt);
1985   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1986
1987   if (rhs_class == GIMPLE_BINARY_RHS)
1988     rhs2 = gimple_assign_rhs2 (stmt);
1989
1990   /* Handle unary rhs and binary rhs with integer constants as second
1991      operand.  */
1992
1993   if (rhs_class == GIMPLE_UNARY_RHS
1994       || (rhs_class == GIMPLE_BINARY_RHS
1995           && TREE_CODE (rhs2) == INTEGER_CST))
1996     {
1997       if (code != BIT_AND_EXPR
1998           && code != LSHIFT_EXPR
1999           && code != RSHIFT_EXPR
2000           && code != LROTATE_EXPR
2001           && code != RROTATE_EXPR
2002           && !CONVERT_EXPR_CODE_P (code))
2003         return NULL;
2004
2005       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2006
2007       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2008          we have to initialize the symbolic number.  */
2009       if (!source_stmt1)
2010         {
2011           if (gimple_assign_load_p (stmt)
2012               || !init_symbolic_number (n, rhs1))
2013             return NULL;
2014           source_stmt1 = stmt;
2015         }
2016
2017       switch (code)
2018         {
2019         case BIT_AND_EXPR:
2020           {
2021             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2022             uint64_t val = int_cst_value (rhs2), mask = 0;
2023             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2024
2025             /* Only constants masking full bytes are allowed.  */
2026             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2027               if ((val & tmp) != 0 && (val & tmp) != tmp)
2028                 return NULL;
2029               else if (val & tmp)
2030                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2031
2032             n->n &= mask;
2033           }
2034           break;
2035         case LSHIFT_EXPR:
2036         case RSHIFT_EXPR:
2037         case LROTATE_EXPR:
2038         case RROTATE_EXPR:
2039           if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2040             return NULL;
2041           break;
2042         CASE_CONVERT:
2043           {
2044             int i, type_size, old_type_size;
2045             tree type;
2046
2047             type = gimple_expr_type (stmt);
2048             type_size = TYPE_PRECISION (type);
2049             if (type_size % BITS_PER_UNIT != 0)
2050               return NULL;
2051             type_size /= BITS_PER_UNIT;
2052             if (type_size > 64 / BITS_PER_MARKER)
2053               return NULL;
2054
2055             /* Sign extension: result is dependent on the value.  */
2056             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2057             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2058                 && HEAD_MARKER (n->n, old_type_size))
2059               for (i = 0; i < type_size - old_type_size; i++)
2060                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2061                         << ((type_size - 1 - i) * BITS_PER_MARKER);
2062
2063             if (type_size < 64 / BITS_PER_MARKER)
2064               {
2065                 /* If STMT casts to a smaller type mask out the bits not
2066                    belonging to the target type.  */
2067                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2068               }
2069             n->type = type;
2070             if (!n->base_addr)
2071               n->range = type_size;
2072           }
2073           break;
2074         default:
2075           return NULL;
2076         };
2077       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2078     }
2079
2080   /* Handle binary rhs.  */
2081
2082   if (rhs_class == GIMPLE_BINARY_RHS)
2083     {
2084       struct symbolic_number n1, n2;
2085       gimple source_stmt, source_stmt2;
2086
2087       if (code != BIT_IOR_EXPR)
2088         return NULL;
2089
2090       if (TREE_CODE (rhs2) != SSA_NAME)
2091         return NULL;
2092
2093       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2094
2095       switch (code)
2096         {
2097         case BIT_IOR_EXPR:
2098           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2099
2100           if (!source_stmt1)
2101             return NULL;
2102
2103           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2104
2105           if (!source_stmt2)
2106             return NULL;
2107
2108           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2109             return NULL;
2110
2111           if (!n1.vuse != !n2.vuse
2112               || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2113             return NULL;
2114
2115           source_stmt
2116             = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2117
2118           if (!source_stmt)
2119             return NULL;
2120
2121           if (!verify_symbolic_number_p (n, stmt))
2122             return NULL;
2123
2124           break;
2125         default:
2126           return NULL;
2127         }
2128       return source_stmt;
2129     }
2130   return NULL;
2131 }
2132
2133 /* Check if STMT completes a bswap implementation or a read in a given
2134    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2135    accordingly.  It also sets N to represent the kind of operations
2136    performed: size of the resulting expression and whether it works on
2137    a memory source, and if so alias-set and vuse.  At last, the
2138    function returns a stmt whose rhs's first tree is the source
2139    expression.  */
2140
2141 static gimple
2142 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2143 {
2144 /* The number which the find_bswap_or_nop_1 result should match in order
2145    to have a full byte swap.  The number is shifted to the right
2146    according to the size of the symbolic number before using it.  */
2147   uint64_t cmpxchg = CMPXCHG;
2148   uint64_t cmpnop = CMPNOP;
2149
2150   gimple source_stmt;
2151   int limit;
2152
2153   /* The last parameter determines the depth search limit.  It usually
2154      correlates directly to the number n of bytes to be touched.  We
2155      increase that number by log2(n) + 1 here in order to also
2156      cover signed -> unsigned conversions of the src operand as can be seen
2157      in libgcc, and for initial shift/and operation of the src operand.  */
2158   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2159   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2160   source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2161
2162   if (!source_stmt)
2163     return NULL;
2164
2165   /* Find real size of result (highest non-zero byte).  */
2166   if (n->base_addr)
2167     {
2168       int rsize;
2169       uint64_t tmpn;
2170
2171       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2172       n->range = rsize;
2173     }
2174
2175   /* Zero out the extra bits of N and CMP*.  */
2176   if (n->range < (int) sizeof (int64_t))
2177     {
2178       uint64_t mask;
2179
2180       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2181       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2182       cmpnop &= mask;
2183     }
2184
2185   /* A complete byte swap should make the symbolic number to start with
2186      the largest digit in the highest order byte. Unchanged symbolic
2187      number indicates a read with same endianness as target architecture.  */
2188   if (n->n == cmpnop)
2189     *bswap = false;
2190   else if (n->n == cmpxchg)
2191     *bswap = true;
2192   else
2193     return NULL;
2194
2195   /* Useless bit manipulation performed by code.  */
2196   if (!n->base_addr && n->n == cmpnop)
2197     return NULL;
2198
2199   n->range *= BITS_PER_UNIT;
2200   return source_stmt;
2201 }
2202
2203 namespace {
2204
2205 const pass_data pass_data_optimize_bswap =
2206 {
2207   GIMPLE_PASS, /* type */
2208   "bswap", /* name */
2209   OPTGROUP_NONE, /* optinfo_flags */
2210   TV_NONE, /* tv_id */
2211   PROP_ssa, /* properties_required */
2212   0, /* properties_provided */
2213   0, /* properties_destroyed */
2214   0, /* todo_flags_start */
2215   0, /* todo_flags_finish */
2216 };
2217
2218 class pass_optimize_bswap : public gimple_opt_pass
2219 {
2220 public:
2221   pass_optimize_bswap (gcc::context *ctxt)
2222     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2223   {}
2224
2225   /* opt_pass methods: */
2226   virtual bool gate (function *)
2227     {
2228       return flag_expensive_optimizations && optimize;
2229     }
2230
2231   virtual unsigned int execute (function *);
2232
2233 }; // class pass_optimize_bswap
2234
2235 /* Perform the bswap optimization: replace the expression computed in the rhs
2236    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2237    Which of these alternatives replace the rhs is given by N->base_addr (non
2238    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2239    load to perform are also given in N while the builtin bswap invoke is given
2240    in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2241    load statements involved to construct the rhs in CUR_STMT and N->range gives
2242    the size of the rhs expression for maintaining some statistics.
2243
2244    Note that if the replacement involve a load, CUR_STMT is moved just after
2245    SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2246    changing of basic block.  */
2247
2248 static bool
2249 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2250                tree load_type, struct symbolic_number *n, bool bswap)
2251 {
2252   gimple_stmt_iterator gsi;
2253   tree src, tmp, tgt;
2254   gimple bswap_stmt;
2255
2256   gsi = gsi_for_stmt (cur_stmt);
2257   src = gimple_assign_rhs1 (src_stmt);
2258   tgt = gimple_assign_lhs (cur_stmt);
2259
2260   /* Need to load the value from memory first.  */
2261   if (n->base_addr)
2262     {
2263       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2264       tree addr_expr, addr_tmp, val_expr, val_tmp;
2265       tree load_offset_ptr, aligned_load_type;
2266       gimple addr_stmt, load_stmt;
2267       unsigned align;
2268       HOST_WIDE_INT load_offset = 0;
2269
2270       align = get_object_alignment (src);
2271       /* If the new access is smaller than the original one, we need
2272          to perform big endian adjustment.  */
2273       if (BYTES_BIG_ENDIAN)
2274         {
2275           HOST_WIDE_INT bitsize, bitpos;
2276           machine_mode mode;
2277           int unsignedp, volatilep;
2278           tree offset;
2279
2280           get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2281                                &unsignedp, &volatilep, false);
2282           if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2283             {
2284               load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2285               unsigned HOST_WIDE_INT l
2286                 = (load_offset * BITS_PER_UNIT) & (align - 1);
2287               if (l)
2288                 align = l & -l;
2289             }
2290         }
2291
2292       if (bswap
2293           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2294           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2295         return false;
2296
2297       /* Move cur_stmt just before  one of the load of the original
2298          to ensure it has the same VUSE.  See PR61517 for what could
2299          go wrong.  */
2300       gsi_move_before (&gsi, &gsi_ins);
2301       gsi = gsi_for_stmt (cur_stmt);
2302
2303       /* Compute address to load from and cast according to the size
2304          of the load.  */
2305       addr_expr = build_fold_addr_expr (unshare_expr (src));
2306       if (is_gimple_mem_ref_addr (addr_expr))
2307         addr_tmp = addr_expr;
2308       else
2309         {
2310           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2311                                          "load_src");
2312           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2313           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2314         }
2315
2316       /* Perform the load.  */
2317       aligned_load_type = load_type;
2318       if (align < TYPE_ALIGN (load_type))
2319         aligned_load_type = build_aligned_type (load_type, align);
2320       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2321       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2322                               load_offset_ptr);
2323
2324       if (!bswap)
2325         {
2326           if (n->range == 16)
2327             nop_stats.found_16bit++;
2328           else if (n->range == 32)
2329             nop_stats.found_32bit++;
2330           else
2331             {
2332               gcc_assert (n->range == 64);
2333               nop_stats.found_64bit++;
2334             }
2335
2336           /* Convert the result of load if necessary.  */
2337           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2338             {
2339               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2340                                             "load_dst");
2341               load_stmt = gimple_build_assign (val_tmp, val_expr);
2342               gimple_set_vuse (load_stmt, n->vuse);
2343               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2344               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2345             }
2346           else
2347             {
2348               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2349               gimple_set_vuse (cur_stmt, n->vuse);
2350             }
2351           update_stmt (cur_stmt);
2352
2353           if (dump_file)
2354             {
2355               fprintf (dump_file,
2356                        "%d bit load in target endianness found at: ",
2357                        (int) n->range);
2358               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2359             }
2360           return true;
2361         }
2362       else
2363         {
2364           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2365           load_stmt = gimple_build_assign (val_tmp, val_expr);
2366           gimple_set_vuse (load_stmt, n->vuse);
2367           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2368         }
2369       src = val_tmp;
2370     }
2371
2372   if (n->range == 16)
2373     bswap_stats.found_16bit++;
2374   else if (n->range == 32)
2375     bswap_stats.found_32bit++;
2376   else
2377     {
2378       gcc_assert (n->range == 64);
2379       bswap_stats.found_64bit++;
2380     }
2381
2382   tmp = src;
2383
2384   /* Convert the src expression if necessary.  */
2385   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2386     {
2387       gimple convert_stmt;
2388
2389       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2390       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2391       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2392     }
2393
2394   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2395      are considered as rotation of 2N bit values by N bits is generally not
2396      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2397      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2398   if (bswap && n->range == 16)
2399     {
2400       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2401       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2402       bswap_stmt = gimple_build_assign (NULL, src);
2403     }
2404   else
2405     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2406
2407   tmp = tgt;
2408
2409   /* Convert the result if necessary.  */
2410   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2411     {
2412       gimple convert_stmt;
2413
2414       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2415       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2416       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2417     }
2418
2419   gimple_set_lhs (bswap_stmt, tmp);
2420
2421   if (dump_file)
2422     {
2423       fprintf (dump_file, "%d bit bswap implementation found at: ",
2424                (int) n->range);
2425       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2426     }
2427
2428   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2429   gsi_remove (&gsi, true);
2430   return true;
2431 }
2432
2433 /* Find manual byte swap implementations as well as load in a given
2434    endianness. Byte swaps are turned into a bswap builtin invokation
2435    while endian loads are converted to bswap builtin invokation or
2436    simple load according to the target endianness.  */
2437
2438 unsigned int
2439 pass_optimize_bswap::execute (function *fun)
2440 {
2441   basic_block bb;
2442   bool bswap32_p, bswap64_p;
2443   bool changed = false;
2444   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2445
2446   if (BITS_PER_UNIT != 8)
2447     return 0;
2448
2449   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2450                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2451   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2452                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2453                    || (bswap32_p && word_mode == SImode)));
2454
2455   /* Determine the argument type of the builtins.  The code later on
2456      assumes that the return and argument type are the same.  */
2457   if (bswap32_p)
2458     {
2459       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2460       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2461     }
2462
2463   if (bswap64_p)
2464     {
2465       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2466       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2467     }
2468
2469   memset (&nop_stats, 0, sizeof (nop_stats));
2470   memset (&bswap_stats, 0, sizeof (bswap_stats));
2471
2472   FOR_EACH_BB_FN (bb, fun)
2473     {
2474       gimple_stmt_iterator gsi;
2475
2476       /* We do a reverse scan for bswap patterns to make sure we get the
2477          widest match. As bswap pattern matching doesn't handle previously
2478          inserted smaller bswap replacements as sub-patterns, the wider
2479          variant wouldn't be detected.  */
2480       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2481         {
2482           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2483           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2484           enum tree_code code;
2485           struct symbolic_number n;
2486           bool bswap;
2487
2488           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2489              might be moved to a different basic block by bswap_replace and gsi
2490              must not points to it if that's the case.  Moving the gsi_prev
2491              there make sure that gsi points to the statement previous to
2492              cur_stmt while still making sure that all statements are
2493              considered in this basic block.  */
2494           gsi_prev (&gsi);
2495
2496           if (!is_gimple_assign (cur_stmt))
2497             continue;
2498
2499           code = gimple_assign_rhs_code (cur_stmt);
2500           switch (code)
2501             {
2502             case LROTATE_EXPR:
2503             case RROTATE_EXPR:
2504               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2505                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2506                      % BITS_PER_UNIT)
2507                 continue;
2508               /* Fall through.  */
2509             case BIT_IOR_EXPR:
2510               break;
2511             default:
2512               continue;
2513             }
2514
2515           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2516
2517           if (!src_stmt)
2518             continue;
2519
2520           switch (n.range)
2521             {
2522             case 16:
2523               /* Already in canonical form, nothing to do.  */
2524               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2525                 continue;
2526               load_type = bswap_type = uint16_type_node;
2527               break;
2528             case 32:
2529               load_type = uint32_type_node;
2530               if (bswap32_p)
2531                 {
2532                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2533                   bswap_type = bswap32_type;
2534                 }
2535               break;
2536             case 64:
2537               load_type = uint64_type_node;
2538               if (bswap64_p)
2539                 {
2540                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2541                   bswap_type = bswap64_type;
2542                 }
2543               break;
2544             default:
2545               continue;
2546             }
2547
2548           if (bswap && !fndecl && n.range != 16)
2549             continue;
2550
2551           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2552                              &n, bswap))
2553             changed = true;
2554         }
2555     }
2556
2557   statistics_counter_event (fun, "16-bit nop implementations found",
2558                             nop_stats.found_16bit);
2559   statistics_counter_event (fun, "32-bit nop implementations found",
2560                             nop_stats.found_32bit);
2561   statistics_counter_event (fun, "64-bit nop implementations found",
2562                             nop_stats.found_64bit);
2563   statistics_counter_event (fun, "16-bit bswap implementations found",
2564                             bswap_stats.found_16bit);
2565   statistics_counter_event (fun, "32-bit bswap implementations found",
2566                             bswap_stats.found_32bit);
2567   statistics_counter_event (fun, "64-bit bswap implementations found",
2568                             bswap_stats.found_64bit);
2569
2570   return (changed ? TODO_update_ssa : 0);
2571 }
2572
2573 } // anon namespace
2574
2575 gimple_opt_pass *
2576 make_pass_optimize_bswap (gcc::context *ctxt)
2577 {
2578   return new pass_optimize_bswap (ctxt);
2579 }
2580
2581 /* Return true if stmt is a type conversion operation that can be stripped
2582    when used in a widening multiply operation.  */
2583 static bool
2584 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2585 {
2586   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2587
2588   if (TREE_CODE (result_type) == INTEGER_TYPE)
2589     {
2590       tree op_type;
2591       tree inner_op_type;
2592
2593       if (!CONVERT_EXPR_CODE_P (rhs_code))
2594         return false;
2595
2596       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2597
2598       /* If the type of OP has the same precision as the result, then
2599          we can strip this conversion.  The multiply operation will be
2600          selected to create the correct extension as a by-product.  */
2601       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2602         return true;
2603
2604       /* We can also strip a conversion if it preserves the signed-ness of
2605          the operation and doesn't narrow the range.  */
2606       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2607
2608       /* If the inner-most type is unsigned, then we can strip any
2609          intermediate widening operation.  If it's signed, then the
2610          intermediate widening operation must also be signed.  */
2611       if ((TYPE_UNSIGNED (inner_op_type)
2612            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2613           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2614         return true;
2615
2616       return false;
2617     }
2618
2619   return rhs_code == FIXED_CONVERT_EXPR;
2620 }
2621
2622 /* Return true if RHS is a suitable operand for a widening multiplication,
2623    assuming a target type of TYPE.
2624    There are two cases:
2625
2626      - RHS makes some value at least twice as wide.  Store that value
2627        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2628
2629      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2630        but leave *TYPE_OUT untouched.  */
2631
2632 static bool
2633 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2634                         tree *new_rhs_out)
2635 {
2636   gimple stmt;
2637   tree type1, rhs1;
2638
2639   if (TREE_CODE (rhs) == SSA_NAME)
2640     {
2641       stmt = SSA_NAME_DEF_STMT (rhs);
2642       if (is_gimple_assign (stmt))
2643         {
2644           if (! widening_mult_conversion_strippable_p (type, stmt))
2645             rhs1 = rhs;
2646           else
2647             {
2648               rhs1 = gimple_assign_rhs1 (stmt);
2649
2650               if (TREE_CODE (rhs1) == INTEGER_CST)
2651                 {
2652                   *new_rhs_out = rhs1;
2653                   *type_out = NULL;
2654                   return true;
2655                 }
2656             }
2657         }
2658       else
2659         rhs1 = rhs;
2660
2661       type1 = TREE_TYPE (rhs1);
2662
2663       if (TREE_CODE (type1) != TREE_CODE (type)
2664           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2665         return false;
2666
2667       *new_rhs_out = rhs1;
2668       *type_out = type1;
2669       return true;
2670     }
2671
2672   if (TREE_CODE (rhs) == INTEGER_CST)
2673     {
2674       *new_rhs_out = rhs;
2675       *type_out = NULL;
2676       return true;
2677     }
2678
2679   return false;
2680 }
2681
2682 /* Return true if STMT performs a widening multiplication, assuming the
2683    output type is TYPE.  If so, store the unwidened types of the operands
2684    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2685    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2686    and *TYPE2_OUT would give the operands of the multiplication.  */
2687
2688 static bool
2689 is_widening_mult_p (gimple stmt,
2690                     tree *type1_out, tree *rhs1_out,
2691                     tree *type2_out, tree *rhs2_out)
2692 {
2693   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2694
2695   if (TREE_CODE (type) != INTEGER_TYPE
2696       && TREE_CODE (type) != FIXED_POINT_TYPE)
2697     return false;
2698
2699   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2700                                rhs1_out))
2701     return false;
2702
2703   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2704                                rhs2_out))
2705     return false;
2706
2707   if (*type1_out == NULL)
2708     {
2709       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2710         return false;
2711       *type1_out = *type2_out;
2712     }
2713
2714   if (*type2_out == NULL)
2715     {
2716       if (!int_fits_type_p (*rhs2_out, *type1_out))
2717         return false;
2718       *type2_out = *type1_out;
2719     }
2720
2721   /* Ensure that the larger of the two operands comes first. */
2722   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2723     {
2724       tree tmp;
2725       tmp = *type1_out;
2726       *type1_out = *type2_out;
2727       *type2_out = tmp;
2728       tmp = *rhs1_out;
2729       *rhs1_out = *rhs2_out;
2730       *rhs2_out = tmp;
2731     }
2732
2733   return true;
2734 }
2735
2736 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2737    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2738    value is true iff we converted the statement.  */
2739
2740 static bool
2741 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2742 {
2743   tree lhs, rhs1, rhs2, type, type1, type2;
2744   enum insn_code handler;
2745   machine_mode to_mode, from_mode, actual_mode;
2746   optab op;
2747   int actual_precision;
2748   location_t loc = gimple_location (stmt);
2749   bool from_unsigned1, from_unsigned2;
2750
2751   lhs = gimple_assign_lhs (stmt);
2752   type = TREE_TYPE (lhs);
2753   if (TREE_CODE (type) != INTEGER_TYPE)
2754     return false;
2755
2756   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2757     return false;
2758
2759   to_mode = TYPE_MODE (type);
2760   from_mode = TYPE_MODE (type1);
2761   from_unsigned1 = TYPE_UNSIGNED (type1);
2762   from_unsigned2 = TYPE_UNSIGNED (type2);
2763
2764   if (from_unsigned1 && from_unsigned2)
2765     op = umul_widen_optab;
2766   else if (!from_unsigned1 && !from_unsigned2)
2767     op = smul_widen_optab;
2768   else
2769     op = usmul_widen_optab;
2770
2771   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2772                                                   0, &actual_mode);
2773
2774   if (handler == CODE_FOR_nothing)
2775     {
2776       if (op != smul_widen_optab)
2777         {
2778           /* We can use a signed multiply with unsigned types as long as
2779              there is a wider mode to use, or it is the smaller of the two
2780              types that is unsigned.  Note that type1 >= type2, always.  */
2781           if ((TYPE_UNSIGNED (type1)
2782                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2783               || (TYPE_UNSIGNED (type2)
2784                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2785             {
2786               from_mode = GET_MODE_WIDER_MODE (from_mode);
2787               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2788                 return false;
2789             }
2790
2791           op = smul_widen_optab;
2792           handler = find_widening_optab_handler_and_mode (op, to_mode,
2793                                                           from_mode, 0,
2794                                                           &actual_mode);
2795
2796           if (handler == CODE_FOR_nothing)
2797             return false;
2798
2799           from_unsigned1 = from_unsigned2 = false;
2800         }
2801       else
2802         return false;
2803     }
2804
2805   /* Ensure that the inputs to the handler are in the correct precison
2806      for the opcode.  This will be the full mode size.  */
2807   actual_precision = GET_MODE_PRECISION (actual_mode);
2808   if (2 * actual_precision > TYPE_PRECISION (type))
2809     return false;
2810   if (actual_precision != TYPE_PRECISION (type1)
2811       || from_unsigned1 != TYPE_UNSIGNED (type1))
2812     rhs1 = build_and_insert_cast (gsi, loc,
2813                                   build_nonstandard_integer_type
2814                                     (actual_precision, from_unsigned1), rhs1);
2815   if (actual_precision != TYPE_PRECISION (type2)
2816       || from_unsigned2 != TYPE_UNSIGNED (type2))
2817     rhs2 = build_and_insert_cast (gsi, loc,
2818                                   build_nonstandard_integer_type
2819                                     (actual_precision, from_unsigned2), rhs2);
2820
2821   /* Handle constants.  */
2822   if (TREE_CODE (rhs1) == INTEGER_CST)
2823     rhs1 = fold_convert (type1, rhs1);
2824   if (TREE_CODE (rhs2) == INTEGER_CST)
2825     rhs2 = fold_convert (type2, rhs2);
2826
2827   gimple_assign_set_rhs1 (stmt, rhs1);
2828   gimple_assign_set_rhs2 (stmt, rhs2);
2829   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2830   update_stmt (stmt);
2831   widen_mul_stats.widen_mults_inserted++;
2832   return true;
2833 }
2834
2835 /* Process a single gimple statement STMT, which is found at the
2836    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2837    rhs (given by CODE), and try to convert it into a
2838    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2839    is true iff we converted the statement.  */
2840
2841 static bool
2842 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2843                             enum tree_code code)
2844 {
2845   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2846   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2847   tree type, type1, type2, optype;
2848   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2849   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2850   optab this_optab;
2851   enum tree_code wmult_code;
2852   enum insn_code handler;
2853   machine_mode to_mode, from_mode, actual_mode;
2854   location_t loc = gimple_location (stmt);
2855   int actual_precision;
2856   bool from_unsigned1, from_unsigned2;
2857
2858   lhs = gimple_assign_lhs (stmt);
2859   type = TREE_TYPE (lhs);
2860   if (TREE_CODE (type) != INTEGER_TYPE
2861       && TREE_CODE (type) != FIXED_POINT_TYPE)
2862     return false;
2863
2864   if (code == MINUS_EXPR)
2865     wmult_code = WIDEN_MULT_MINUS_EXPR;
2866   else
2867     wmult_code = WIDEN_MULT_PLUS_EXPR;
2868
2869   rhs1 = gimple_assign_rhs1 (stmt);
2870   rhs2 = gimple_assign_rhs2 (stmt);
2871
2872   if (TREE_CODE (rhs1) == SSA_NAME)
2873     {
2874       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2875       if (is_gimple_assign (rhs1_stmt))
2876         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2877     }
2878
2879   if (TREE_CODE (rhs2) == SSA_NAME)
2880     {
2881       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2882       if (is_gimple_assign (rhs2_stmt))
2883         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2884     }
2885
2886   /* Allow for one conversion statement between the multiply
2887      and addition/subtraction statement.  If there are more than
2888      one conversions then we assume they would invalidate this
2889      transformation.  If that's not the case then they should have
2890      been folded before now.  */
2891   if (CONVERT_EXPR_CODE_P (rhs1_code))
2892     {
2893       conv1_stmt = rhs1_stmt;
2894       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2895       if (TREE_CODE (rhs1) == SSA_NAME)
2896         {
2897           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2898           if (is_gimple_assign (rhs1_stmt))
2899             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2900         }
2901       else
2902         return false;
2903     }
2904   if (CONVERT_EXPR_CODE_P (rhs2_code))
2905     {
2906       conv2_stmt = rhs2_stmt;
2907       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2908       if (TREE_CODE (rhs2) == SSA_NAME)
2909         {
2910           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2911           if (is_gimple_assign (rhs2_stmt))
2912             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2913         }
2914       else
2915         return false;
2916     }
2917
2918   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2919      is_widening_mult_p, but we still need the rhs returns.
2920
2921      It might also appear that it would be sufficient to use the existing
2922      operands of the widening multiply, but that would limit the choice of
2923      multiply-and-accumulate instructions.
2924
2925      If the widened-multiplication result has more than one uses, it is
2926      probably wiser not to do the conversion.  */
2927   if (code == PLUS_EXPR
2928       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2929     {
2930       if (!has_single_use (rhs1)
2931           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2932                                   &type2, &mult_rhs2))
2933         return false;
2934       add_rhs = rhs2;
2935       conv_stmt = conv1_stmt;
2936     }
2937   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2938     {
2939       if (!has_single_use (rhs2)
2940           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2941                                   &type2, &mult_rhs2))
2942         return false;
2943       add_rhs = rhs1;
2944       conv_stmt = conv2_stmt;
2945     }
2946   else
2947     return false;
2948
2949   to_mode = TYPE_MODE (type);
2950   from_mode = TYPE_MODE (type1);
2951   from_unsigned1 = TYPE_UNSIGNED (type1);
2952   from_unsigned2 = TYPE_UNSIGNED (type2);
2953   optype = type1;
2954
2955   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2956   if (from_unsigned1 != from_unsigned2)
2957     {
2958       if (!INTEGRAL_TYPE_P (type))
2959         return false;
2960       /* We can use a signed multiply with unsigned types as long as
2961          there is a wider mode to use, or it is the smaller of the two
2962          types that is unsigned.  Note that type1 >= type2, always.  */
2963       if ((from_unsigned1
2964            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2965           || (from_unsigned2
2966               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2967         {
2968           from_mode = GET_MODE_WIDER_MODE (from_mode);
2969           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2970             return false;
2971         }
2972
2973       from_unsigned1 = from_unsigned2 = false;
2974       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2975                                                false);
2976     }
2977
2978   /* If there was a conversion between the multiply and addition
2979      then we need to make sure it fits a multiply-and-accumulate.
2980      The should be a single mode change which does not change the
2981      value.  */
2982   if (conv_stmt)
2983     {
2984       /* We use the original, unmodified data types for this.  */
2985       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2986       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2987       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2988       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2989
2990       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2991         {
2992           /* Conversion is a truncate.  */
2993           if (TYPE_PRECISION (to_type) < data_size)
2994             return false;
2995         }
2996       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2997         {
2998           /* Conversion is an extend.  Check it's the right sort.  */
2999           if (TYPE_UNSIGNED (from_type) != is_unsigned
3000               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3001             return false;
3002         }
3003       /* else convert is a no-op for our purposes.  */
3004     }
3005
3006   /* Verify that the machine can perform a widening multiply
3007      accumulate in this mode/signedness combination, otherwise
3008      this transformation is likely to pessimize code.  */
3009   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3010   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3011                                                   from_mode, 0, &actual_mode);
3012
3013   if (handler == CODE_FOR_nothing)
3014     return false;
3015
3016   /* Ensure that the inputs to the handler are in the correct precison
3017      for the opcode.  This will be the full mode size.  */
3018   actual_precision = GET_MODE_PRECISION (actual_mode);
3019   if (actual_precision != TYPE_PRECISION (type1)
3020       || from_unsigned1 != TYPE_UNSIGNED (type1))
3021     mult_rhs1 = build_and_insert_cast (gsi, loc,
3022                                        build_nonstandard_integer_type
3023                                          (actual_precision, from_unsigned1),
3024                                        mult_rhs1);
3025   if (actual_precision != TYPE_PRECISION (type2)
3026       || from_unsigned2 != TYPE_UNSIGNED (type2))
3027     mult_rhs2 = build_and_insert_cast (gsi, loc,
3028                                        build_nonstandard_integer_type
3029                                          (actual_precision, from_unsigned2),
3030                                        mult_rhs2);
3031
3032   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3033     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3034
3035   /* Handle constants.  */
3036   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3037     mult_rhs1 = fold_convert (type1, mult_rhs1);
3038   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3039     mult_rhs2 = fold_convert (type2, mult_rhs2);
3040
3041   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3042                                   add_rhs);
3043   update_stmt (gsi_stmt (*gsi));
3044   widen_mul_stats.maccs_inserted++;
3045   return true;
3046 }
3047
3048 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3049    with uses in additions and subtractions to form fused multiply-add
3050    operations.  Returns true if successful and MUL_STMT should be removed.  */
3051
3052 static bool
3053 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3054 {
3055   tree mul_result = gimple_get_lhs (mul_stmt);
3056   tree type = TREE_TYPE (mul_result);
3057   gimple use_stmt, neguse_stmt;
3058   gassign *fma_stmt;
3059   use_operand_p use_p;
3060   imm_use_iterator imm_iter;
3061
3062   if (FLOAT_TYPE_P (type)
3063       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3064     return false;
3065
3066   /* We don't want to do bitfield reduction ops.  */
3067   if (INTEGRAL_TYPE_P (type)
3068       && (TYPE_PRECISION (type)
3069           != GET_MODE_PRECISION (TYPE_MODE (type))))
3070     return false;
3071
3072   /* If the target doesn't support it, don't generate it.  We assume that
3073      if fma isn't available then fms, fnma or fnms are not either.  */
3074   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3075     return false;
3076
3077   /* If the multiplication has zero uses, it is kept around probably because
3078      of -fnon-call-exceptions.  Don't optimize it away in that case,
3079      it is DCE job.  */
3080   if (has_zero_uses (mul_result))
3081     return false;
3082
3083   /* Make sure that the multiplication statement becomes dead after
3084      the transformation, thus that all uses are transformed to FMAs.
3085      This means we assume that an FMA operation has the same cost
3086      as an addition.  */
3087   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3088     {
3089       enum tree_code use_code;
3090       tree result = mul_result;
3091       bool negate_p = false;
3092
3093       use_stmt = USE_STMT (use_p);
3094
3095       if (is_gimple_debug (use_stmt))
3096         continue;
3097
3098       /* For now restrict this operations to single basic blocks.  In theory
3099          we would want to support sinking the multiplication in
3100          m = a*b;
3101          if ()
3102            ma = m + c;
3103          else
3104            d = m;
3105          to form a fma in the then block and sink the multiplication to the
3106          else block.  */
3107       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3108         return false;
3109
3110       if (!is_gimple_assign (use_stmt))
3111         return false;
3112
3113       use_code = gimple_assign_rhs_code (use_stmt);
3114
3115       /* A negate on the multiplication leads to FNMA.  */
3116       if (use_code == NEGATE_EXPR)
3117         {
3118           ssa_op_iter iter;
3119           use_operand_p usep;
3120
3121           result = gimple_assign_lhs (use_stmt);
3122
3123           /* Make sure the negate statement becomes dead with this
3124              single transformation.  */
3125           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3126                                &use_p, &neguse_stmt))
3127             return false;
3128
3129           /* Make sure the multiplication isn't also used on that stmt.  */
3130           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3131             if (USE_FROM_PTR (usep) == mul_result)
3132               return false;
3133
3134           /* Re-validate.  */
3135           use_stmt = neguse_stmt;
3136           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3137             return false;
3138           if (!is_gimple_assign (use_stmt))
3139             return false;
3140
3141           use_code = gimple_assign_rhs_code (use_stmt);
3142           negate_p = true;
3143         }
3144
3145       switch (use_code)
3146         {
3147         case MINUS_EXPR:
3148           if (gimple_assign_rhs2 (use_stmt) == result)
3149             negate_p = !negate_p;
3150           break;
3151         case PLUS_EXPR:
3152           break;
3153         default:
3154           /* FMA can only be formed from PLUS and MINUS.  */
3155           return false;
3156         }
3157
3158       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3159          by a MULT_EXPR that we'll visit later, we might be able to
3160          get a more profitable match with fnma.
3161          OTOH, if we don't, a negate / fma pair has likely lower latency
3162          that a mult / subtract pair.  */
3163       if (use_code == MINUS_EXPR && !negate_p
3164           && gimple_assign_rhs1 (use_stmt) == result
3165           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3166           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3167         {
3168           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3169
3170           if (TREE_CODE (rhs2) == SSA_NAME)
3171             {
3172               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3173               if (has_single_use (rhs2)
3174                   && is_gimple_assign (stmt2)
3175                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3176               return false;
3177             }
3178         }
3179
3180       /* We can't handle a * b + a * b.  */
3181       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3182         return false;
3183
3184       /* While it is possible to validate whether or not the exact form
3185          that we've recognized is available in the backend, the assumption
3186          is that the transformation is never a loss.  For instance, suppose
3187          the target only has the plain FMA pattern available.  Consider
3188          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3189          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3190          still have 3 operations, but in the FMA form the two NEGs are
3191          independent and could be run in parallel.  */
3192     }
3193
3194   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3195     {
3196       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3197       enum tree_code use_code;
3198       tree addop, mulop1 = op1, result = mul_result;
3199       bool negate_p = false;
3200
3201       if (is_gimple_debug (use_stmt))
3202         continue;
3203
3204       use_code = gimple_assign_rhs_code (use_stmt);
3205       if (use_code == NEGATE_EXPR)
3206         {
3207           result = gimple_assign_lhs (use_stmt);
3208           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3209           gsi_remove (&gsi, true);
3210           release_defs (use_stmt);
3211
3212           use_stmt = neguse_stmt;
3213           gsi = gsi_for_stmt (use_stmt);
3214           use_code = gimple_assign_rhs_code (use_stmt);
3215           negate_p = true;
3216         }
3217
3218       if (gimple_assign_rhs1 (use_stmt) == result)
3219         {
3220           addop = gimple_assign_rhs2 (use_stmt);
3221           /* a * b - c -> a * b + (-c)  */
3222           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3223             addop = force_gimple_operand_gsi (&gsi,
3224                                               build1 (NEGATE_EXPR,
3225                                                       type, addop),
3226                                               true, NULL_TREE, true,
3227                                               GSI_SAME_STMT);
3228         }
3229       else
3230         {
3231           addop = gimple_assign_rhs1 (use_stmt);
3232           /* a - b * c -> (-b) * c + a */
3233           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3234             negate_p = !negate_p;
3235         }
3236
3237       if (negate_p)
3238         mulop1 = force_gimple_operand_gsi (&gsi,
3239                                            build1 (NEGATE_EXPR,
3240                                                    type, mulop1),
3241                                            true, NULL_TREE, true,
3242                                            GSI_SAME_STMT);
3243
3244       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3245                                       FMA_EXPR, mulop1, op2, addop);
3246       gsi_replace (&gsi, fma_stmt, true);
3247       widen_mul_stats.fmas_inserted++;
3248     }
3249
3250   return true;
3251 }
3252
3253 /* Find integer multiplications where the operands are extended from
3254    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3255    where appropriate.  */
3256
3257 namespace {
3258
3259 const pass_data pass_data_optimize_widening_mul =
3260 {
3261   GIMPLE_PASS, /* type */
3262   "widening_mul", /* name */
3263   OPTGROUP_NONE, /* optinfo_flags */
3264   TV_NONE, /* tv_id */
3265   PROP_ssa, /* properties_required */
3266   0, /* properties_provided */
3267   0, /* properties_destroyed */
3268   0, /* todo_flags_start */
3269   TODO_update_ssa, /* todo_flags_finish */
3270 };
3271
3272 class pass_optimize_widening_mul : public gimple_opt_pass
3273 {
3274 public:
3275   pass_optimize_widening_mul (gcc::context *ctxt)
3276     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3277   {}
3278
3279   /* opt_pass methods: */
3280   virtual bool gate (function *)
3281     {
3282       return flag_expensive_optimizations && optimize;
3283     }
3284
3285   virtual unsigned int execute (function *);
3286
3287 }; // class pass_optimize_widening_mul
3288
3289 unsigned int
3290 pass_optimize_widening_mul::execute (function *fun)
3291 {
3292   basic_block bb;
3293   bool cfg_changed = false;
3294
3295   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3296
3297   FOR_EACH_BB_FN (bb, fun)
3298     {
3299       gimple_stmt_iterator gsi;
3300
3301       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3302         {
3303           gimple stmt = gsi_stmt (gsi);
3304           enum tree_code code;
3305
3306           if (is_gimple_assign (stmt))
3307             {
3308               code = gimple_assign_rhs_code (stmt);
3309               switch (code)
3310                 {
3311                 case MULT_EXPR:
3312                   if (!convert_mult_to_widen (stmt, &gsi)
3313                       && convert_mult_to_fma (stmt,
3314                                               gimple_assign_rhs1 (stmt),
3315                                               gimple_assign_rhs2 (stmt)))
3316                     {
3317                       gsi_remove (&gsi, true);
3318                       release_defs (stmt);
3319                       continue;
3320                     }
3321                   break;
3322
3323                 case PLUS_EXPR:
3324                 case MINUS_EXPR:
3325                   convert_plusminus_to_widen (&gsi, stmt, code);
3326                   break;
3327
3328                 default:;
3329                 }
3330             }
3331           else if (is_gimple_call (stmt)
3332                    && gimple_call_lhs (stmt))
3333             {
3334               tree fndecl = gimple_call_fndecl (stmt);
3335               if (fndecl
3336                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3337                 {
3338                   switch (DECL_FUNCTION_CODE (fndecl))
3339                     {
3340                       case BUILT_IN_POWF:
3341                       case BUILT_IN_POW:
3342                       case BUILT_IN_POWL:
3343                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3344                             && REAL_VALUES_EQUAL
3345                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3346                                   dconst2)
3347                             && convert_mult_to_fma (stmt,
3348                                                     gimple_call_arg (stmt, 0),
3349                                                     gimple_call_arg (stmt, 0)))
3350                           {
3351                             unlink_stmt_vdef (stmt);
3352                             if (gsi_remove (&gsi, true)
3353                                 && gimple_purge_dead_eh_edges (bb))
3354                               cfg_changed = true;
3355                             release_defs (stmt);
3356                             continue;
3357                           }
3358                           break;
3359
3360                       default:;
3361                     }
3362                 }
3363             }
3364           gsi_next (&gsi);
3365         }
3366     }
3367
3368   statistics_counter_event (fun, "widening multiplications inserted",
3369                             widen_mul_stats.widen_mults_inserted);
3370   statistics_counter_event (fun, "widening maccs inserted",
3371                             widen_mul_stats.maccs_inserted);
3372   statistics_counter_event (fun, "fused multiply-adds inserted",
3373                             widen_mul_stats.fmas_inserted);
3374
3375   return cfg_changed ? TODO_cleanup_cfg : 0;
3376 }
3377
3378 } // anon namespace
3379
3380 gimple_opt_pass *
3381 make_pass_optimize_widening_mul (gcc::context *ctxt)
3382 {
3383   return new pass_optimize_widening_mul (ctxt);
3384 }