1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "params.h"
31 #include "expr.h"
32 #include "hashtab.h"
33 #include "recog.h"
34 #include "target.h"
35 #include "dumpfile.h"
36
37 /* This pass performs loop unrolling and peeling.  We only perform these
38    optimizations on innermost loops (with a single exception) because
39    the impact on performance is greatest there, and we want to avoid
40    unnecessary code size growth.  The gain comes from greater sequentiality
41    of code, better opportunities for further passes to optimize, and in
42    some cases from fewer tests of exit conditions.  The main problem is
43    code growth, which impacts performance negatively due to cache effects.
44
45 What we do:
46
47    -- complete peeling of once-rolling loops; this is the above-mentioned
48       exception, as it cancels the loop completely and does not cause
49       code growth
50    -- complete peeling of loops that roll a (small) constant number of times
51    -- simple peeling of the first iterations of loops that do not roll much
52       (according to profile feedback)
53    -- unrolling of loops that roll a constant number of times; this is
54       almost always a win, as we get rid of exit condition tests
55    -- unrolling of loops that roll a number of times that we can compute
56       at runtime; we also get rid of exit condition tests here, but there
57       is the extra expense of calculating the number of iterations
58    -- simple unrolling of remaining loops; this is performed only if we
59       are asked to, as the gain is questionable in this case and it may
60       even slow down the code
61    For more detailed descriptions of each of these, see the comments at
62    the appropriate functions below.
63
64    There are a number of parameters (defined and described in params.def)
65    that control how much we unroll/peel.
66
67    ??? A big problem is that we do not have a good way to determine how
68    many times we should unroll a loop; the experiments I have made showed
69    that this choice may affect performance on the order of several percent.
70    */
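/* As an illustrative sketch only (not code emitted by this pass), unrolling
   a loop such as

     for (i = 0; i < n; i++)
       sum += a[i];

   four times, combined with induction variable splitting and accumulator
   expansion (both described below), yields roughly

     for (i = 0; i + 3 < n; i += 4)
       {
         sum += a[i];
         sum1 += a[i + 1];
         sum2 += a[i + 2];
         sum3 += a[i + 3];
       }
     sum += sum1 + sum2 + sum3;
     ... epilogue for the remaining iterations ...

   removing three of every four exit condition tests and breaking the
   dependence chain on sum.  */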
71
72 /* Information about induction variables to split. */
73
74 struct iv_to_split
75 {
76   rtx insn;		/* The insn in which the induction variable occurs.  */
77 rtx orig_var; /* The variable (register) for the IV before split. */
78   rtx base_var;	/* The variable on which the values in the further
79 			   iterations are based.  */
80 rtx step; /* Step of the induction variable. */
81 struct iv_to_split *next; /* Next entry in walking order. */
82 unsigned n_loc;
83 unsigned loc[3]; /* Location where the definition of the induction
84 variable occurs in the insn. For example if
85 N_LOC is 2, the expression is located at
86 XEXP (XEXP (single_set, loc[0]), loc[1]). */
87 };
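/* For illustration (a hypothetical increment insn): given
   (set (reg i) (plus (reg i) (const_int 1))), analyze_iv_to_split_insn
   below records N_LOC == 1 and LOC[0] == 1, so the induction variable
   expression is found at XEXP (single_set, 1), i.e. the SET_SRC.  */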
88
89 /* Information about accumulators to expand. */
90
91 struct var_to_expand
92 {
93   rtx insn;		           /* The insn in which the variable expansion occurs.  */
94 rtx reg; /* The accumulator which is expanded. */
95 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
96 struct var_to_expand *next; /* Next entry in walking order. */
97 enum rtx_code op; /* The type of the accumulation - addition, subtraction
98 or multiplication. */
99 int expansion_count; /* Count the number of expansions generated so far. */
100 int reuse_expansion; /* The expansion we intend to reuse to expand
101 the accumulator. If REUSE_EXPANSION is 0 reuse
102 the original accumulator. Else use
103 var_expansions[REUSE_EXPANSION - 1]. */
104 };
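/* For example, once two expansions of an accumulator exist
   (EXPANSION_COUNT == 2), successive copies of the accumulating insn
   cycle through the original register, var_expansions[0] and
   var_expansions[1] as REUSE_EXPANSION advances; see get_expansion.  */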
105
106 /* Information about optimization applied in
107 the unrolled loop. */
108
109 struct opt_info
110 {
111 htab_t insns_to_split; /* A hashtable of insns to split. */
112 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
113 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
114 htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
115 to expand. */
116 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
117 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
118 unsigned first_new_block; /* The first basic block that was
119 duplicated. */
120 basic_block loop_exit; /* The loop exit basic block. */
121 basic_block loop_preheader; /* The loop preheader basic block. */
122 };
123
124 static void decide_unrolling_and_peeling (int);
125 static void peel_loops_completely (int);
126 static void decide_peel_simple (struct loop *, int);
127 static void decide_peel_once_rolling (struct loop *, int);
128 static void decide_peel_completely (struct loop *, int);
129 static void decide_unroll_stupid (struct loop *, int);
130 static void decide_unroll_constant_iterations (struct loop *, int);
131 static void decide_unroll_runtime_iterations (struct loop *, int);
132 static void peel_loop_simple (struct loop *);
133 static void peel_loop_completely (struct loop *);
134 static void unroll_loop_stupid (struct loop *);
135 static void unroll_loop_constant_iterations (struct loop *);
136 static void unroll_loop_runtime_iterations (struct loop *);
137 static struct opt_info *analyze_insns_in_loop (struct loop *);
138 static void opt_info_start_duplication (struct opt_info *);
139 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
140 static void free_opt_info (struct opt_info *);
141 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
142 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
143 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
144 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
145 static void insert_var_expansion_initialization (struct var_to_expand *,
146 basic_block);
147 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
148 basic_block);
149 static rtx get_expansion (struct var_to_expand *);
150
151 /* Emit a message summarizing the unroll or peel that will be
152 performed for LOOP, along with the loop's location LOCUS, if
153 appropriate given the dump or -fopt-info settings. */
154
155 static void
156 report_unroll_peel (struct loop *loop, location_t locus)
157 {
158 struct niter_desc *desc;
159 int niters = 0;
160 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
161
162 if (!dump_enabled_p ())
163 return;
164
165 /* In the special case where the loop never iterated, emit
166 a different message so that we don't report an unroll by 0.
167 This matches the equivalent message emitted during tree unrolling. */
168 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
169 && !loop->lpt_decision.times)
170 {
171 dump_printf_loc (report_flags, locus,
172 "Turned loop into non-loop; it never loops.\n");
173 return;
174 }
175
176 desc = get_simple_loop_desc (loop);
177
178 if (desc->const_iter)
179 niters = desc->niter;
180 else if (loop->header->count)
181 niters = expected_loop_iterations (loop);
182
183 dump_printf_loc (report_flags, locus,
184 "%s loop %d times",
185 (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
186 ? "Completely unroll"
187 : (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
188 ? "Peel" : "Unroll")),
189 loop->lpt_decision.times);
190 if (profile_info)
191 dump_printf (report_flags,
192 " (header execution count %d",
193 (int)loop->header->count);
194 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
195 dump_printf (report_flags,
196 "%s%s iterations %d)",
197 profile_info ? ", " : " (",
198 desc->const_iter ? "const" : "average",
199 niters);
200 else if (profile_info)
201 dump_printf (report_flags, ")");
202
203 dump_printf (report_flags, "\n");
204 }
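/* For instance, with profile feedback the function above emits dump lines
   such as (sample output only):

     Unroll loop 3 times (header execution count 5000)
     Completely unroll loop 4 times (const iterations 4)

   depending on the decision taken and on whether profile_info is set.  */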
205
206 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
207 void
208 unroll_and_peel_loops (int flags)
209 {
210 struct loop *loop;
211 bool check;
212 loop_iterator li;
213
214   /* First perform complete loop peeling (it is almost surely a win,
215      and it affects the parameters for further decisions a lot).  */
216 peel_loops_completely (flags);
217
218   /* Now decide the rest of the unrolling and peeling.  */
219 decide_unrolling_and_peeling (flags);
220
221 /* Scan the loops, inner ones first. */
222 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
223 {
224 check = true;
225 /* And perform the appropriate transformations. */
226 switch (loop->lpt_decision.decision)
227 {
228 case LPT_PEEL_COMPLETELY:
229 /* Already done. */
230 gcc_unreachable ();
231 case LPT_PEEL_SIMPLE:
232 peel_loop_simple (loop);
233 break;
234 case LPT_UNROLL_CONSTANT:
235 unroll_loop_constant_iterations (loop);
236 break;
237 case LPT_UNROLL_RUNTIME:
238 unroll_loop_runtime_iterations (loop);
239 break;
240 case LPT_UNROLL_STUPID:
241 unroll_loop_stupid (loop);
242 break;
243 case LPT_NONE:
244 check = false;
245 break;
246 default:
247 gcc_unreachable ();
248 }
249 if (check)
250 {
251 #ifdef ENABLE_CHECKING
252 verify_loop_structure ();
253 #endif
254 }
255 }
256
257 iv_analysis_done ();
258 }
259
260 /* Check whether the exit of LOOP is at the end of the loop body.  */
261
262 static bool
263 loop_exit_at_end_p (struct loop *loop)
264 {
265 struct niter_desc *desc = get_simple_loop_desc (loop);
266 rtx insn;
267
268 if (desc->in_edge->dest != loop->latch)
269 return false;
270
271 /* Check that the latch is empty. */
272 FOR_BB_INSNS (loop->latch, insn)
273 {
274 if (NONDEBUG_INSN_P (insn))
275 return false;
276 }
277
278 return true;
279 }
280
281 /* Depending on FLAGS, check whether to peel loops completely and do so. */
282 static void
283 peel_loops_completely (int flags)
284 {
285 struct loop *loop;
286 loop_iterator li;
287
288 /* Scan the loops, the inner ones first. */
289 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
290 {
291 loop->lpt_decision.decision = LPT_NONE;
292 location_t locus = get_loop_location (loop);
293
294 if (dump_enabled_p ())
295 dump_printf_loc (TDF_RTL, locus,
296 ";; *** Considering loop %d at BB %d for "
297 "complete peeling ***\n",
298 loop->num, loop->header->index);
299
300 loop->ninsns = num_loop_insns (loop);
301
302 decide_peel_once_rolling (loop, flags);
303 if (loop->lpt_decision.decision == LPT_NONE)
304 decide_peel_completely (loop, flags);
305
306 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
307 {
308 report_unroll_peel (loop, locus);
309 peel_loop_completely (loop);
310 #ifdef ENABLE_CHECKING
311 verify_loop_structure ();
312 #endif
313 }
314 }
315 }
316
317 /* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
318 static void
319 decide_unrolling_and_peeling (int flags)
320 {
321 struct loop *loop;
322 loop_iterator li;
323
324 /* Scan the loops, inner ones first. */
325 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
326 {
327 loop->lpt_decision.decision = LPT_NONE;
328 location_t locus = get_loop_location (loop);
329
330 if (dump_enabled_p ())
331 dump_printf_loc (TDF_RTL, locus,
332 ";; *** Considering loop %d at BB %d for "
333 "unrolling and peeling ***\n",
334 loop->num, loop->header->index);
335
336 /* Do not peel cold areas. */
337 if (optimize_loop_for_size_p (loop))
338 {
339 if (dump_file)
340 fprintf (dump_file, ";; Not considering loop, cold area\n");
341 continue;
342 }
343
344 /* Can the loop be manipulated? */
345 if (!can_duplicate_loop_p (loop))
346 {
347 if (dump_file)
348 fprintf (dump_file,
349 ";; Not considering loop, cannot duplicate\n");
350 continue;
351 }
352
353 /* Skip non-innermost loops. */
354 if (loop->inner)
355 {
356 if (dump_file)
357 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
358 continue;
359 }
360
361 loop->ninsns = num_loop_insns (loop);
362 loop->av_ninsns = average_num_loop_insns (loop);
363
364 /* Try transformations one by one in decreasing order of
365 priority. */
366
367 decide_unroll_constant_iterations (loop, flags);
368 if (loop->lpt_decision.decision == LPT_NONE)
369 decide_unroll_runtime_iterations (loop, flags);
370 if (loop->lpt_decision.decision == LPT_NONE)
371 decide_unroll_stupid (loop, flags);
372 if (loop->lpt_decision.decision == LPT_NONE)
373 decide_peel_simple (loop, flags);
374
375 report_unroll_peel (loop, locus);
376 }
377 }
378
379 /* Decide whether the LOOP is once rolling and suitable for complete
380 peeling. */
381 static void
382 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
383 {
384 struct niter_desc *desc;
385
386 if (dump_file)
387 fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
388
389 /* Is the loop small enough? */
390 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
391 {
392 if (dump_file)
393 fprintf (dump_file, ";; Not considering loop, is too big\n");
394 return;
395 }
396
397 /* Check for simple loops. */
398 desc = get_simple_loop_desc (loop);
399
400 /* Check number of iterations. */
401 if (!desc->simple_p
402 || desc->assumptions
403 || desc->infinite
404 || !desc->const_iter
405 || (desc->niter != 0
406 && max_loop_iterations_int (loop) != 0))
407 {
408 if (dump_file)
409 fprintf (dump_file,
410 ";; Unable to prove that the loop rolls exactly once\n");
411 return;
412 }
413
414 /* Success. */
415 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
416 }
417
418 /* Decide whether the LOOP is suitable for complete peeling. */
419 static void
420 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
421 {
422 unsigned npeel;
423 struct niter_desc *desc;
424
425 if (dump_file)
426 fprintf (dump_file, "\n;; Considering peeling completely\n");
427
428 /* Skip non-innermost loops. */
429 if (loop->inner)
430 {
431 if (dump_file)
432 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
433 return;
434 }
435
436 /* Do not peel cold areas. */
437 if (optimize_loop_for_size_p (loop))
438 {
439 if (dump_file)
440 fprintf (dump_file, ";; Not considering loop, cold area\n");
441 return;
442 }
443
444 /* Can the loop be manipulated? */
445 if (!can_duplicate_loop_p (loop))
446 {
447 if (dump_file)
448 fprintf (dump_file,
449 ";; Not considering loop, cannot duplicate\n");
450 return;
451 }
452
453 /* npeel = number of iterations to peel. */
454 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
455 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
456 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
457
458 /* Is the loop small enough? */
459 if (!npeel)
460 {
461 if (dump_file)
462 fprintf (dump_file, ";; Not considering loop, is too big\n");
463 return;
464 }
465
466 /* Check for simple loops. */
467 desc = get_simple_loop_desc (loop);
468
469 /* Check number of iterations. */
470 if (!desc->simple_p
471 || desc->assumptions
472 || !desc->const_iter
473 || desc->infinite)
474 {
475 if (dump_file)
476 fprintf (dump_file,
477 ";; Unable to prove that the loop iterates constant times\n");
478 return;
479 }
480
481 if (desc->niter > npeel - 1)
482 {
483 if (dump_file)
484 {
485 fprintf (dump_file,
486 ";; Not peeling loop completely, rolls too much (");
487 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
488 fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
489 }
490 return;
491 }
492
493 /* Success. */
494 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
495 }
496
497 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
498    completely.  The transformation does this:
499
500 for (i = 0; i < 4; i++)
501 body;
502
503 ==>
504
505 i = 0;
506 body; i++;
507 body; i++;
508 body; i++;
509 body; i++;
510 */
511 static void
512 peel_loop_completely (struct loop *loop)
513 {
514 sbitmap wont_exit;
515 unsigned HOST_WIDE_INT npeel;
516 unsigned i;
517 vec<edge> remove_edges;
518 edge ein;
519 struct niter_desc *desc = get_simple_loop_desc (loop);
520 struct opt_info *opt_info = NULL;
521
522 npeel = desc->niter;
523
524 if (npeel)
525 {
526 bool ok;
527
528 wont_exit = sbitmap_alloc (npeel + 1);
529 bitmap_ones (wont_exit);
530 bitmap_clear_bit (wont_exit, 0);
531 if (desc->noloop_assumptions)
532 bitmap_clear_bit (wont_exit, 1);
533
534 remove_edges.create (0);
535
536 if (flag_split_ivs_in_unroller)
537 opt_info = analyze_insns_in_loop (loop);
538
539 opt_info_start_duplication (opt_info);
540 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
541 npeel,
542 wont_exit, desc->out_edge,
543 &remove_edges,
544 DLTHE_FLAG_UPDATE_FREQ
545 | DLTHE_FLAG_COMPLETTE_PEEL
546 | (opt_info
547 ? DLTHE_RECORD_COPY_NUMBER : 0));
548 gcc_assert (ok);
549
550 free (wont_exit);
551
552 if (opt_info)
553 {
554 apply_opt_in_copies (opt_info, npeel, false, true);
555 free_opt_info (opt_info);
556 }
557
558 /* Remove the exit edges. */
559 FOR_EACH_VEC_ELT (remove_edges, i, ein)
560 remove_path (ein);
561 remove_edges.release ();
562 }
563
564 ein = desc->in_edge;
565 free_simple_loop_desc (loop);
566
567 /* Now remove the unreachable part of the last iteration and cancel
568 the loop. */
569 remove_path (ein);
570
571 if (dump_file)
572 fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
573 }
574
575 /* Decide whether to unroll LOOP iterating a constant number of times
576    and how much.  */
577
578 static void
579 decide_unroll_constant_iterations (struct loop *loop, int flags)
580 {
581 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
582 struct niter_desc *desc;
583 double_int iterations;
584
585 if (!(flags & UAP_UNROLL))
586 {
587       /* We were not asked to, just return silently.  */
588 return;
589 }
590
591 if (dump_file)
592 fprintf (dump_file,
593 "\n;; Considering unrolling loop with constant "
594 "number of iterations\n");
595
596   /* nunroll = total number of copies of the original loop body in the
597      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
598 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
599 nunroll_by_av
600 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
601 if (nunroll > nunroll_by_av)
602 nunroll = nunroll_by_av;
603 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
604 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
605
606 /* Skip big loops. */
607 if (nunroll <= 1)
608 {
609 if (dump_file)
610 fprintf (dump_file, ";; Not considering loop, is too big\n");
611 return;
612 }
613
614 /* Check for simple loops. */
615 desc = get_simple_loop_desc (loop);
616
617 /* Check number of iterations. */
618 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
619 {
620 if (dump_file)
621 fprintf (dump_file,
622 ";; Unable to prove that the loop iterates constant times\n");
623 return;
624 }
625
626   /* Check whether the loop rolls enough to consider.
627      Consult also the loop bounds and profile; in case the loop has more
628      than one exit, it may well loop fewer times than the determined
629      maximal number of iterations.  */
630 if (desc->niter < 2 * nunroll
631 || ((estimated_loop_iterations (loop, &iterations)
632 || max_loop_iterations (loop, &iterations))
633 && iterations.ult (double_int::from_shwi (2 * nunroll))))
634 {
635 if (dump_file)
636 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
637 return;
638 }
639
640   /* Success; now compute the number of times to unroll.  We alter
641      nunroll so that as few copies of the loop body as possible are
642      necessary, while still not decreasing the number of unrollings
643      too much (at most by 1).  */
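  /* A worked example, assuming a loop whose exit is not at the end of its
     body: with desc->niter == 102 and nunroll == 4, the search below
     considers i from 10 down to 3 and settles on i == 5, since
     102 % 6 == 0 makes n_copies == 6 the minimum; the loop is unrolled
     5 times (6 copies) with no peeled prologue iterations.  */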
644 best_copies = 2 * nunroll + 10;
645
646 i = 2 * nunroll + 2;
647 if (i - 1 >= desc->niter)
648 i = desc->niter - 2;
649
650 for (; i >= nunroll - 1; i--)
651 {
652 unsigned exit_mod = desc->niter % (i + 1);
653
654 if (!loop_exit_at_end_p (loop))
655 n_copies = exit_mod + i + 1;
656 else if (exit_mod != (unsigned) i
657 || desc->noloop_assumptions != NULL_RTX)
658 n_copies = exit_mod + i + 2;
659 else
660 n_copies = i + 1;
661
662 if (n_copies < best_copies)
663 {
664 best_copies = n_copies;
665 best_unroll = i;
666 }
667 }
668
669 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
670 loop->lpt_decision.times = best_unroll;
671 }
672
673 /* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES times.
674 The transformation does this:
675
676 for (i = 0; i < 102; i++)
677 body;
678
679 ==> (LOOP->LPT_DECISION.TIMES == 3)
680
681 i = 0;
682 body; i++;
683 body; i++;
684 while (i < 102)
685 {
686 body; i++;
687 body; i++;
688 body; i++;
689 body; i++;
690 }
691 */
692 static void
693 unroll_loop_constant_iterations (struct loop *loop)
694 {
695 unsigned HOST_WIDE_INT niter;
696 unsigned exit_mod;
697 sbitmap wont_exit;
698 unsigned i;
699 vec<edge> remove_edges;
700 edge e;
701 unsigned max_unroll = loop->lpt_decision.times;
702 struct niter_desc *desc = get_simple_loop_desc (loop);
703 bool exit_at_end = loop_exit_at_end_p (loop);
704 struct opt_info *opt_info = NULL;
705 bool ok;
706
707 niter = desc->niter;
708
709   /* Should not get here (such a loop should be peeled instead).  */
710 gcc_assert (niter > max_unroll + 1);
711
712 exit_mod = niter % (max_unroll + 1);
713
714 wont_exit = sbitmap_alloc (max_unroll + 1);
715 bitmap_ones (wont_exit);
716
717 remove_edges.create (0);
718 if (flag_split_ivs_in_unroller
719 || flag_variable_expansion_in_unroller)
720 opt_info = analyze_insns_in_loop (loop);
721
722 if (!exit_at_end)
723 {
724       /* The exit is not at the end of the loop; leave the exit test
725 	 in the first copy, so that loops that start with a test of the
726 	 exit condition have a contiguous body after unrolling.  */
727
728 if (dump_file)
729 fprintf (dump_file, ";; Condition at beginning of loop.\n");
730
731 /* Peel exit_mod iterations. */
732 bitmap_clear_bit (wont_exit, 0);
733 if (desc->noloop_assumptions)
734 bitmap_clear_bit (wont_exit, 1);
735
736 if (exit_mod)
737 {
738 opt_info_start_duplication (opt_info);
739 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
740 exit_mod,
741 wont_exit, desc->out_edge,
742 &remove_edges,
743 DLTHE_FLAG_UPDATE_FREQ
744 | (opt_info && exit_mod > 1
745 ? DLTHE_RECORD_COPY_NUMBER
746 : 0));
747 gcc_assert (ok);
748
749 if (opt_info && exit_mod > 1)
750 apply_opt_in_copies (opt_info, exit_mod, false, false);
751
752 desc->noloop_assumptions = NULL_RTX;
753 desc->niter -= exit_mod;
754 loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
755 if (loop->any_estimate
756 && double_int::from_uhwi (exit_mod).ule
757 (loop->nb_iterations_estimate))
758 loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
759 else
760 loop->any_estimate = false;
761 }
762
763 bitmap_set_bit (wont_exit, 1);
764 }
765 else
766 {
767       /* Leave the exit test in the last copy, for the same reason as
768 	 above, if the loop tests the condition at the end of its body.  */
769
770 if (dump_file)
771 fprintf (dump_file, ";; Condition at end of loop.\n");
772
773       /* We know that niter >= max_unroll + 2, so we do not need to care
774 	 about the case where we would exit before reaching the loop.  So
775 	 just peel exit_mod + 1 iterations.  */
776 if (exit_mod != max_unroll
777 || desc->noloop_assumptions)
778 {
779 bitmap_clear_bit (wont_exit, 0);
780 if (desc->noloop_assumptions)
781 bitmap_clear_bit (wont_exit, 1);
782
783 opt_info_start_duplication (opt_info);
784 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
785 exit_mod + 1,
786 wont_exit, desc->out_edge,
787 &remove_edges,
788 DLTHE_FLAG_UPDATE_FREQ
789 | (opt_info && exit_mod > 0
790 ? DLTHE_RECORD_COPY_NUMBER
791 : 0));
792 gcc_assert (ok);
793
794 if (opt_info && exit_mod > 0)
795 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
796
797 desc->niter -= exit_mod + 1;
798 loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
799 if (loop->any_estimate
800 && double_int::from_uhwi (exit_mod + 1).ule
801 (loop->nb_iterations_estimate))
802 loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
803 else
804 loop->any_estimate = false;
805 desc->noloop_assumptions = NULL_RTX;
806
807 bitmap_set_bit (wont_exit, 0);
808 bitmap_set_bit (wont_exit, 1);
809 }
810
811 bitmap_clear_bit (wont_exit, max_unroll);
812 }
813
814 /* Now unroll the loop. */
815
816 opt_info_start_duplication (opt_info);
817 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
818 max_unroll,
819 wont_exit, desc->out_edge,
820 &remove_edges,
821 DLTHE_FLAG_UPDATE_FREQ
822 | (opt_info
823 ? DLTHE_RECORD_COPY_NUMBER
824 : 0));
825 gcc_assert (ok);
826
827 if (opt_info)
828 {
829 apply_opt_in_copies (opt_info, max_unroll, true, true);
830 free_opt_info (opt_info);
831 }
832
833 free (wont_exit);
834
835 if (exit_at_end)
836 {
837 basic_block exit_block = get_bb_copy (desc->in_edge->src);
838 /* Find a new in and out edge; they are in the last copy we have made. */
839
840 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
841 {
842 desc->out_edge = EDGE_SUCC (exit_block, 0);
843 desc->in_edge = EDGE_SUCC (exit_block, 1);
844 }
845 else
846 {
847 desc->out_edge = EDGE_SUCC (exit_block, 1);
848 desc->in_edge = EDGE_SUCC (exit_block, 0);
849 }
850 }
851
852 desc->niter /= max_unroll + 1;
853 loop->nb_iterations_upper_bound
854 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
855 + 1),
856 TRUNC_DIV_EXPR);
857 if (loop->any_estimate)
858 loop->nb_iterations_estimate
859 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
860 + 1),
861 TRUNC_DIV_EXPR);
862 desc->niter_expr = GEN_INT (desc->niter);
863
864 /* Remove the edges. */
865 FOR_EACH_VEC_ELT (remove_edges, i, e)
866 remove_path (e);
867 remove_edges.release ();
868
869 if (dump_file)
870 fprintf (dump_file,
871 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
872 max_unroll, num_loop_insns (loop));
873 }
874
875 /* Decide whether to unroll LOOP iterating a runtime-computable number
876    of times and how much.  */
877 static void
878 decide_unroll_runtime_iterations (struct loop *loop, int flags)
879 {
880 unsigned nunroll, nunroll_by_av, i;
881 struct niter_desc *desc;
882 double_int iterations;
883
884 if (!(flags & UAP_UNROLL))
885 {
886       /* We were not asked to, just return silently.  */
887 return;
888 }
889
890 if (dump_file)
891 fprintf (dump_file,
892 "\n;; Considering unrolling loop with runtime "
893 "computable number of iterations\n");
894
895   /* nunroll = total number of copies of the original loop body in the
896      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
897 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
898 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
899 if (nunroll > nunroll_by_av)
900 nunroll = nunroll_by_av;
901 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
902 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
903
904 if (targetm.loop_unroll_adjust)
905 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
906
907 /* Skip big loops. */
908 if (nunroll <= 1)
909 {
910 if (dump_file)
911 fprintf (dump_file, ";; Not considering loop, is too big\n");
912 return;
913 }
914
915 /* Check for simple loops. */
916 desc = get_simple_loop_desc (loop);
917
918   /* Check simplicity.  */
919 if (!desc->simple_p || desc->assumptions)
920 {
921 if (dump_file)
922 fprintf (dump_file,
923 ";; Unable to prove that the number of iterations "
924 "can be counted in runtime\n");
925 return;
926 }
927
928 if (desc->const_iter)
929 {
930 if (dump_file)
931 fprintf (dump_file, ";; Loop iterates constant times\n");
932 return;
933 }
934
935 /* Check whether the loop rolls. */
936 if ((estimated_loop_iterations (loop, &iterations)
937 || max_loop_iterations (loop, &iterations))
938 && iterations.ult (double_int::from_shwi (2 * nunroll)))
939 {
940 if (dump_file)
941 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
942 return;
943 }
944
945   /* Success; now force nunroll to be a power of 2, as we are unable to
946      cope with overflows in the computation of the number of iterations.  */
947 for (i = 1; 2 * i <= nunroll; i *= 2)
948 continue;
949
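  /* For example, with nunroll == 6 the loop above leaves i == 4, so the
     loop body will exist in four copies and lpt_decision.times == 3.  */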
950 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
951 loop->lpt_decision.times = i - 1;
952 }
953
954 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
955 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
956 and NULL is returned instead. */
957
958 basic_block
959 split_edge_and_insert (edge e, rtx insns)
960 {
961 basic_block bb;
962
963 if (!insns)
964 return NULL;
965 bb = split_edge (e);
966 emit_insn_after (insns, BB_END (bb));
967
968 /* ??? We used to assume that INSNS can contain control flow insns, and
969 that we had to try to find sub basic blocks in BB to maintain a valid
970 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
971 and call break_superblocks when going out of cfglayout mode. But it
972 turns out that this never happens; and that if it does ever happen,
973 the TODO_verify_flow at the end of the RTL loop passes would fail.
974
975 There are two reasons why we expected we could have control flow insns
976 in INSNS. The first is when a comparison has to be done in parts, and
977 the second is when the number of iterations is computed for loops with
978 the number of iterations known at runtime. In both cases, test cases
979 to get control flow in INSNS appear to be impossible to construct:
980
981 * If do_compare_rtx_and_jump needs several branches to do comparison
982 in a mode that needs comparison by parts, we cannot analyze the
983 number of iterations of the loop, and we never get to unrolling it.
984
985 * The code in expand_divmod that was suspected to cause creation of
986 branching code seems to be only accessed for signed division. The
987 divisions used by # of iterations analysis are always unsigned.
988      Problems might arise on architectures that emit branching code
989 for some operations that may appear in the unroller (especially
990 for division), but we have no such architectures.
991
992 Considering all this, it was decided that we should for now assume
993 that INSNS can in theory contain control flow insns, but in practice
994 it never does. So we don't handle the theoretical case, and should
995 a real failure ever show up, we have a pretty good clue for how to
996 fix it. */
997
998 return bb;
999 }
1000
1001 /* Unroll LOOP, for which we are able to count the number of iterations
1002    at runtime, LOOP->LPT_DECISION.TIMES times.  The transformation does
1003    this (with some extra care for the case n < 0):
1004
1005 for (i = 0; i < n; i++)
1006 body;
1007
1008 ==> (LOOP->LPT_DECISION.TIMES == 3)
1009
1010 i = 0;
1011 mod = n % 4;
1012
1013 switch (mod)
1014 {
1015 case 3:
1016 body; i++;
1017 case 2:
1018 body; i++;
1019 case 1:
1020 body; i++;
1021 case 0: ;
1022 }
1023
1024 while (i < n)
1025 {
1026 body; i++;
1027 body; i++;
1028 body; i++;
1029 body; i++;
1030 }
1031 */
1032 static void
1033 unroll_loop_runtime_iterations (struct loop *loop)
1034 {
1035 rtx old_niter, niter, init_code, branch_code, tmp;
1036 unsigned i, j, p;
1037 basic_block preheader, *body, swtch, ezc_swtch;
1038 vec<basic_block> dom_bbs;
1039 sbitmap wont_exit;
1040 int may_exit_copy;
1041 unsigned n_peel;
1042 vec<edge> remove_edges;
1043 edge e;
1044 bool extra_zero_check, last_may_exit;
1045 unsigned max_unroll = loop->lpt_decision.times;
1046 struct niter_desc *desc = get_simple_loop_desc (loop);
1047 bool exit_at_end = loop_exit_at_end_p (loop);
1048 struct opt_info *opt_info = NULL;
1049 bool ok;
1050
1051 if (flag_split_ivs_in_unroller
1052 || flag_variable_expansion_in_unroller)
1053 opt_info = analyze_insns_in_loop (loop);
1054
1055 /* Remember blocks whose dominators will have to be updated. */
1056 dom_bbs.create (0);
1057
1058 body = get_loop_body (loop);
1059 for (i = 0; i < loop->num_nodes; i++)
1060 {
1061 vec<basic_block> ldom;
1062 basic_block bb;
1063
1064 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1065 FOR_EACH_VEC_ELT (ldom, j, bb)
1066 if (!flow_bb_inside_loop_p (loop, bb))
1067 dom_bbs.safe_push (bb);
1068
1069 ldom.release ();
1070 }
1071 free (body);
1072
1073 if (!exit_at_end)
1074 {
1075 /* Leave exit in first copy (for explanation why see comment in
1076 unroll_loop_constant_iterations). */
1077 may_exit_copy = 0;
1078 n_peel = max_unroll - 1;
1079 extra_zero_check = true;
1080 last_may_exit = false;
1081 }
1082 else
1083 {
1084 /* Leave exit in last copy (for explanation why see comment in
1085 unroll_loop_constant_iterations). */
1086 may_exit_copy = max_unroll;
1087 n_peel = max_unroll;
1088 extra_zero_check = false;
1089 last_may_exit = true;
1090 }
1091
1092 /* Get expression for number of iterations. */
1093 start_sequence ();
1094 old_niter = niter = gen_reg_rtx (desc->mode);
1095 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1096 if (tmp != niter)
1097 emit_move_insn (niter, tmp);
1098
1099 /* Count modulo by ANDing it with max_unroll; we use the fact that
1100 the number of unrollings is a power of two, and thus this is correct
1101 even if there is overflow in the computation. */
1102 niter = expand_simple_binop (desc->mode, AND,
1103 niter,
1104 GEN_INT (max_unroll),
1105 NULL_RTX, 0, OPTAB_LIB_WIDEN);
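  /* For illustration: with max_unroll == 3 (four copies of the body),
     niter & 3 equals niter % 4, and the AND stays correct even if the
     computation of niter wrapped around.  */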
1106
1107 init_code = get_insns ();
1108 end_sequence ();
1109 unshare_all_rtl_in_chain (init_code);
1110
1111 /* Precondition the loop. */
1112 split_edge_and_insert (loop_preheader_edge (loop), init_code);
1113
1114 remove_edges.create (0);
1115
1116 wont_exit = sbitmap_alloc (max_unroll + 2);
1117
1118   /* Peel the first copy of the loop body (almost always we must leave
1119      the exit test here; the only exception is when we have an extra zero
1120      check and the number of iterations is reliable).  Also record the
1121      place of the (possible) extra zero check.  */
1122 bitmap_clear (wont_exit);
1123 if (extra_zero_check
1124 && !desc->noloop_assumptions)
1125 bitmap_set_bit (wont_exit, 1);
1126 ezc_swtch = loop_preheader_edge (loop)->src;
1127 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1128 1, wont_exit, desc->out_edge,
1129 &remove_edges,
1130 DLTHE_FLAG_UPDATE_FREQ);
1131 gcc_assert (ok);
1132
1133 /* Record the place where switch will be built for preconditioning. */
1134 swtch = split_edge (loop_preheader_edge (loop));
1135
1136 for (i = 0; i < n_peel; i++)
1137 {
1138 /* Peel the copy. */
1139 bitmap_clear (wont_exit);
1140 if (i != n_peel - 1 || !last_may_exit)
1141 bitmap_set_bit (wont_exit, 1);
1142 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1143 1, wont_exit, desc->out_edge,
1144 &remove_edges,
1145 DLTHE_FLAG_UPDATE_FREQ);
1146 gcc_assert (ok);
1147
1148 /* Create item for switch. */
1149 j = n_peel - i - (extra_zero_check ? 0 : 1);
1150 p = REG_BR_PROB_BASE / (i + 2);
1151
1152 preheader = split_edge (loop_preheader_edge (loop));
1153 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1154 block_label (preheader), p,
1155 NULL_RTX);
1156
1157 /* We rely on the fact that the compare and jump cannot be optimized out,
1158 and hence the cfg we create is correct. */
1159 gcc_assert (branch_code != NULL_RTX);
1160
1161 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1162 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1163 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1164 e = make_edge (swtch, preheader,
1165 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1166 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1167 e->probability = p;
1168 }
1169
1170 if (extra_zero_check)
1171 {
1172 /* Add branch for zero iterations. */
1173 p = REG_BR_PROB_BASE / (max_unroll + 1);
1174 swtch = ezc_swtch;
1175 preheader = split_edge (loop_preheader_edge (loop));
1176 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1177 block_label (preheader), p,
1178 NULL_RTX);
1179 gcc_assert (branch_code != NULL_RTX);
1180
1181 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1182 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1183 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1184 e = make_edge (swtch, preheader,
1185 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1186 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1187 e->probability = p;
1188 }
1189
1190 /* Recount dominators for outer blocks. */
1191 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1192
1193 /* And unroll loop. */
1194
1195 bitmap_ones (wont_exit);
1196 bitmap_clear_bit (wont_exit, may_exit_copy);
1197 opt_info_start_duplication (opt_info);
1198
1199 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1200 max_unroll,
1201 wont_exit, desc->out_edge,
1202 &remove_edges,
1203 DLTHE_FLAG_UPDATE_FREQ
1204 | (opt_info
1205 ? DLTHE_RECORD_COPY_NUMBER
1206 : 0));
1207 gcc_assert (ok);
1208
1209 if (opt_info)
1210 {
1211 apply_opt_in_copies (opt_info, max_unroll, true, true);
1212 free_opt_info (opt_info);
1213 }
1214
1215 free (wont_exit);
1216
1217 if (exit_at_end)
1218 {
1219 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1220 /* Find a new in and out edge; they are in the last copy we have
1221 made. */
1222
1223 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1224 {
1225 desc->out_edge = EDGE_SUCC (exit_block, 0);
1226 desc->in_edge = EDGE_SUCC (exit_block, 1);
1227 }
1228 else
1229 {
1230 desc->out_edge = EDGE_SUCC (exit_block, 1);
1231 desc->in_edge = EDGE_SUCC (exit_block, 0);
1232 }
1233 }
1234
1235 /* Remove the edges. */
1236 FOR_EACH_VEC_ELT (remove_edges, i, e)
1237 remove_path (e);
1238 remove_edges.release ();
1239
1240   /* We must be careful when updating the number of iterations due to
1241      preconditioning and the fact that the value must be valid at the
1242      entry of the loop.  After passing through the above code, we see
1243      that the correct new number of iterations is this: */
1244 gcc_assert (!desc->const_iter);
1245 desc->niter_expr =
1246 simplify_gen_binary (UDIV, desc->mode, old_niter,
1247 GEN_INT (max_unroll + 1));
1248 loop->nb_iterations_upper_bound
1249 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1250 + 1),
1251 TRUNC_DIV_EXPR);
1252 if (loop->any_estimate)
1253 loop->nb_iterations_estimate
1254 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1255 + 1),
1256 TRUNC_DIV_EXPR);
1257 if (exit_at_end)
1258 {
1259 desc->niter_expr =
1260 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1261 desc->noloop_assumptions = NULL_RTX;
1262 --loop->nb_iterations_upper_bound;
1263 if (loop->any_estimate
1264 && loop->nb_iterations_estimate != double_int_zero)
1265 --loop->nb_iterations_estimate;
1266 else
1267 loop->any_estimate = false;
1268 }
1269
1270 if (dump_file)
1271 fprintf (dump_file,
1272 ";; Unrolled loop %d times, counting # of iterations "
1273 "in runtime, %i insns\n",
1274 max_unroll, num_loop_insns (loop));
1275
1276 dom_bbs.release ();
1277 }
1278
1279 /* Decide whether to simply peel LOOP and how much. */
1280 static void
1281 decide_peel_simple (struct loop *loop, int flags)
1282 {
1283 unsigned npeel;
1284 double_int iterations;
1285
1286 if (!(flags & UAP_PEEL))
1287 {
1288       /* We were not asked to, just return silently.  */
1289 return;
1290 }
1291
1292 if (dump_file)
1293 fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1294
1295 /* npeel = number of iterations to peel. */
1296 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1297 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1298 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
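  /* For illustration, assuming the (hypothetical) values
     PARAM_MAX_PEELED_INSNS == 400 and PARAM_MAX_PEEL_TIMES == 16:
     a loop of 20 insns gives npeel == 400 / 20 == 20, which is then
     capped to 16.  */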
1299
1300 /* Skip big loops. */
1301 if (!npeel)
1302 {
1303 if (dump_file)
1304 fprintf (dump_file, ";; Not considering loop, is too big\n");
1305 return;
1306 }
1307
1308   /* Do not simply peel loops with branches inside -- it increases the
1309      number of mispredicts.
1310      An exception is when we do have a profile and therefore a good chance
1311      of peeling the number of iterations the loop will iterate in practice.
1312      TODO: this heuristic needs tuning; while for complete unrolling
1313      the branch inside the loop mostly eliminates any improvements, for
1314      peeling that is not the case.  Also, a function call inside the loop
1315      is a branch from the branch prediction POV as well (and probably a
1316      better reason not to unroll/peel).  */
1317 if (num_loop_branches (loop) > 1
1318 && profile_status != PROFILE_READ)
1319 {
1320 if (dump_file)
1321 fprintf (dump_file, ";; Not peeling, contains branches\n");
1322 return;
1323 }
1324
1325   /* If we have a realistic estimate of the number of iterations, use it.  */
1326 if (estimated_loop_iterations (loop, &iterations))
1327 {
1328 if (double_int::from_shwi (npeel).ule (iterations))
1329 {
1330 if (dump_file)
1331 {
1332 fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1333 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1334 (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1335 fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1336 npeel);
1337 }
1338 return;
1339 }
1340 npeel = iterations.to_shwi () + 1;
1341 }
1342   /* If we have a small enough bound on the number of iterations, we can
1343      still peel (i.e. completely unroll).  */
1344 else if (max_loop_iterations (loop, &iterations)
1345 && iterations.ult (double_int::from_shwi (npeel)))
1346 npeel = iterations.to_shwi () + 1;
1347 else
1348 {
1349 /* For now we have no good heuristics to decide whether loop peeling
1350 will be effective, so disable it. */
1351 if (dump_file)
1352 fprintf (dump_file,
1353 ";; Not peeling loop, no evidence it will be profitable\n");
1354 return;
1355 }
1356
1357 /* Success. */
1358 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1359 loop->lpt_decision.times = npeel;
1360 }
1361
1362 /* Peel LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1363
1364 while (cond)
1365 body;
1366
1367 ==> (LOOP->LPT_DECISION.TIMES == 3)
1368
1369 if (!cond) goto end;
1370 body;
1371 if (!cond) goto end;
1372 body;
1373 if (!cond) goto end;
1374 body;
1375 while (cond)
1376 body;
1377 end: ;
1378 */
1379 static void
1380 peel_loop_simple (struct loop *loop)
1381 {
1382 sbitmap wont_exit;
1383 unsigned npeel = loop->lpt_decision.times;
1384 struct niter_desc *desc = get_simple_loop_desc (loop);
1385 struct opt_info *opt_info = NULL;
1386 bool ok;
1387
1388 if (flag_split_ivs_in_unroller && npeel > 1)
1389 opt_info = analyze_insns_in_loop (loop);
1390
1391 wont_exit = sbitmap_alloc (npeel + 1);
1392 bitmap_clear (wont_exit);
1393
1394 opt_info_start_duplication (opt_info);
1395
1396 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1397 npeel, wont_exit, NULL,
1398 NULL, DLTHE_FLAG_UPDATE_FREQ
1399 | (opt_info
1400 ? DLTHE_RECORD_COPY_NUMBER
1401 : 0));
1402 gcc_assert (ok);
1403
1404 free (wont_exit);
1405
1406 if (opt_info)
1407 {
1408 apply_opt_in_copies (opt_info, npeel, false, false);
1409 free_opt_info (opt_info);
1410 }
1411
1412 if (desc->simple_p)
1413 {
1414 if (desc->const_iter)
1415 {
1416 desc->niter -= npeel;
1417 desc->niter_expr = GEN_INT (desc->niter);
1418 desc->noloop_assumptions = NULL_RTX;
1419 }
1420 else
1421 {
1422 	  /* We cannot just update niter_expr, as its value might be clobbered
1423 	     inside the loop.  We could handle this by counting the number into a
1424 	     temporary just like we do in runtime unrolling, but it does not
1425 	     seem worthwhile.  */
1426 free_simple_loop_desc (loop);
1427 }
1428 }
1429 if (dump_file)
1430 fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1431 }
1432
1433 /* Decide whether to unroll LOOP stupidly and how much. */
1434 static void
1435 decide_unroll_stupid (struct loop *loop, int flags)
1436 {
1437 unsigned nunroll, nunroll_by_av, i;
1438 struct niter_desc *desc;
1439 double_int iterations;
1440
1441 if (!(flags & UAP_UNROLL_ALL))
1442 {
1443       /* We were not asked to, just return silently.  */
1444 return;
1445 }
1446
1447 if (dump_file)
1448 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1449
1450   /* nunroll = total number of copies of the original loop body in the
1451      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
1452 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1453 nunroll_by_av
1454 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1455 if (nunroll > nunroll_by_av)
1456 nunroll = nunroll_by_av;
1457 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1458 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1459
1460 if (targetm.loop_unroll_adjust)
1461 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1462
1463 /* Skip big loops. */
1464 if (nunroll <= 1)
1465 {
1466 if (dump_file)
1467 fprintf (dump_file, ";; Not considering loop, is too big\n");
1468 return;
1469 }
1470
1471 /* Check for simple loops. */
1472 desc = get_simple_loop_desc (loop);
1473
1474   /* Check simplicity.  */
1475 if (desc->simple_p && !desc->assumptions)
1476 {
1477 if (dump_file)
1478 fprintf (dump_file, ";; The loop is simple\n");
1479 return;
1480 }
1481
1482   /* Do not unroll loops with branches inside -- it increases the
1483      number of mispredicts.
1484      TODO: this heuristic needs tuning; a call inside the loop body
1485      is also a relatively good reason not to unroll.  */
1486 if (num_loop_branches (loop) > 1)
1487 {
1488 if (dump_file)
1489 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1490 return;
1491 }
1492
1493 /* Check whether the loop rolls. */
1494 if ((estimated_loop_iterations (loop, &iterations)
1495 || max_loop_iterations (loop, &iterations))
1496 && iterations.ult (double_int::from_shwi (2 * nunroll)))
1497 {
1498 if (dump_file)
1499 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1500 return;
1501 }
1502
1503   /* Success.  Now force nunroll to be a power of 2, as it seems that
1504      this improves results (partially because of better alignments,
1505      partially because of some dark magic).  */
1506 for (i = 1; 2 * i <= nunroll; i *= 2)
1507 continue;
1508
1509 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1510 loop->lpt_decision.times = i - 1;
1511 }
1512
1513 /* Unroll LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1514
1515 while (cond)
1516 body;
1517
1518 ==> (LOOP->LPT_DECISION.TIMES == 3)
1519
1520 while (cond)
1521 {
1522 body;
1523 if (!cond) break;
1524 body;
1525 if (!cond) break;
1526 body;
1527 if (!cond) break;
1528 body;
1529 }
1530 */
1531 static void
1532 unroll_loop_stupid (struct loop *loop)
1533 {
1534 sbitmap wont_exit;
1535 unsigned nunroll = loop->lpt_decision.times;
1536 struct niter_desc *desc = get_simple_loop_desc (loop);
1537 struct opt_info *opt_info = NULL;
1538 bool ok;
1539
1540 if (flag_split_ivs_in_unroller
1541 || flag_variable_expansion_in_unroller)
1542 opt_info = analyze_insns_in_loop (loop);
1543
1544
1545 wont_exit = sbitmap_alloc (nunroll + 1);
1546 bitmap_clear (wont_exit);
1547 opt_info_start_duplication (opt_info);
1548
1549 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1550 nunroll, wont_exit,
1551 NULL, NULL,
1552 DLTHE_FLAG_UPDATE_FREQ
1553 | (opt_info
1554 ? DLTHE_RECORD_COPY_NUMBER
1555 : 0));
1556 gcc_assert (ok);
1557
1558 if (opt_info)
1559 {
1560 apply_opt_in_copies (opt_info, nunroll, true, true);
1561 free_opt_info (opt_info);
1562 }
1563
1564 free (wont_exit);
1565
1566 if (desc->simple_p)
1567 {
1568 /* We indeed may get here provided that there are nontrivial assumptions
1569 for a loop to be really simple. We could update the counts, but the
1570 problem is that we are unable to decide which exit will be taken
1571 (not really true in case the number of iterations is constant,
1572 	 but no one will do anything with this information, so we do not
1573 worry about it). */
1574 desc->simple_p = false;
1575 }
1576
1577 if (dump_file)
1578 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1579 nunroll, num_loop_insns (loop));
1580 }
1581
1582 /* A hash function for information about insns to split. */
1583
1584 static hashval_t
1585 si_info_hash (const void *ivts)
1586 {
1587 return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1588 }
1589
1590 /* An equality function for information about insns to split.  */
1591
1592 static int
1593 si_info_eq (const void *ivts1, const void *ivts2)
1594 {
1595 const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1596 const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1597
1598 return i1->insn == i2->insn;
1599 }
1600
1601 /* Return a hash for VES, which is really a "var_to_expand *". */
1602
1603 static hashval_t
1604 ve_info_hash (const void *ves)
1605 {
1606 return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1607 }
1608
1609 /* Return true if IVTS1 and IVTS2 (which are really both of type
1610 "var_to_expand *") refer to the same instruction. */
1611
1612 static int
1613 ve_info_eq (const void *ivts1, const void *ivts2)
1614 {
1615 const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1616 const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1617
1618 return i1->insn == i2->insn;
1619 }
1620
1621 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1622 Set *DEBUG_USES to the number of debug insns that reference the
1623 variable. */
1624
1625 bool
1626 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1627 int *debug_uses)
1628 {
1629 basic_block *body, bb;
1630 unsigned i;
1631 int count_ref = 0;
1632 rtx insn;
1633
1634 body = get_loop_body (loop);
1635 for (i = 0; i < loop->num_nodes; i++)
1636 {
1637 bb = body[i];
1638
1639 FOR_BB_INSNS (bb, insn)
1640 if (!rtx_referenced_p (reg, insn))
1641 continue;
1642 else if (DEBUG_INSN_P (insn))
1643 ++*debug_uses;
1644 else if (++count_ref > 1)
1645 break;
1646 }
1647 free (body);
1648 return (count_ref == 1);
1649 }
1650
1651 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1652
1653 static void
1654 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1655 {
1656 basic_block *body, bb;
1657 unsigned i;
1658 rtx insn;
1659
1660 body = get_loop_body (loop);
1661 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1662 {
1663 bb = body[i];
1664
1665 FOR_BB_INSNS (bb, insn)
1666 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1667 continue;
1668 else
1669 {
1670 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1671 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1672 if (!--debug_uses)
1673 break;
1674 }
1675 }
1676 free (body);
1677 }
1678
1679 /* Determine whether INSN contains an accumulator
1680 which can be expanded into separate copies,
1681 one for each copy of the LOOP body.
1682
1683 for (i = 0 ; i < n; i++)
1684 sum += a[i];
1685
1686 ==>
1687
1688 sum += a[i]
1689 ....
1690 i = i+1;
1691 sum1 += a[i]
1692 ....
1693 i = i+1
1694 sum2 += a[i];
1695 ....
1696
1697    Return NULL if INSN contains no opportunity for expansion of an accumulator.
1698 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1699 information and return a pointer to it.
1700 */
1701
1702 static struct var_to_expand *
1703 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1704 {
1705 rtx set, dest, src;
1706 struct var_to_expand *ves;
1707 unsigned accum_pos;
1708 enum rtx_code code;
1709 int debug_uses = 0;
1710
1711 set = single_set (insn);
1712 if (!set)
1713 return NULL;
1714
1715 dest = SET_DEST (set);
1716 src = SET_SRC (set);
1717 code = GET_CODE (src);
1718
1719 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1720 return NULL;
1721
1722 if (FLOAT_MODE_P (GET_MODE (dest)))
1723 {
1724 if (!flag_associative_math)
1725 return NULL;
1726 /* In the case of FMA, we're also changing the rounding. */
1727 if (code == FMA && !flag_unsafe_math_optimizations)
1728 return NULL;
1729 }
1730
1731 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1732      in MD.  But if there is no optab to generate the insn, we cannot
1733 perform the variable expansion. This can happen if an MD provides
1734 an insn but not a named pattern to generate it, for example to avoid
1735 producing code that needs additional mode switches like for x87/mmx.
1736
1737 So we check have_insn_for which looks for an optab for the operation
1738 in SRC. If it doesn't exist, we can't perform the expansion even
1739 though INSN is valid. */
1740 if (!have_insn_for (code, GET_MODE (src)))
1741 return NULL;
1742
1743 if (!REG_P (dest)
1744 && !(GET_CODE (dest) == SUBREG
1745 && REG_P (SUBREG_REG (dest))))
1746 return NULL;
1747
1748 /* Find the accumulator use within the operation. */
1749 if (code == FMA)
1750 {
1751 /* We only support accumulation via FMA in the ADD position. */
1752 if (!rtx_equal_p (dest, XEXP (src, 2)))
1753 return NULL;
1754 accum_pos = 2;
1755 }
1756 else if (rtx_equal_p (dest, XEXP (src, 0)))
1757 accum_pos = 0;
1758 else if (rtx_equal_p (dest, XEXP (src, 1)))
1759 {
1760       /* The method of expansion that we are using, which initializes the
1761          expansions with zero and sums them at the end of the computation,
1762          will yield wrong results for (x = something - x); thus avoid
1763          using it in that case.  */
1764 if (code == MINUS)
1765 return NULL;
1766 accum_pos = 1;
1767 }
1768 else
1769 return NULL;
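  /* A short worked example of the MINUS caveat above (illustration
     only): for x = a[i] - x, two iterations starting from x0 yield
     x == a[1] - a[0] + x0; but with a second accumulator x1 initialized
     to zero and given the odd iterations, x == a[0] - x0 and x1 == a[1],
     and summing them at the loop exit gives a[0] + a[1] - x0, which has
     the wrong signs.  */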
1770
1771 /* It must not otherwise be used. */
1772 if (code == FMA)
1773 {
1774 if (rtx_referenced_p (dest, XEXP (src, 0))
1775 || rtx_referenced_p (dest, XEXP (src, 1)))
1776 return NULL;
1777 }
1778 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1779 return NULL;
1780
1781 /* It must be used in exactly one insn. */
1782 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1783 return NULL;
1784
1785 if (dump_file)
1786 {
1787 fprintf (dump_file, "\n;; Expanding Accumulator ");
1788 print_rtl (dump_file, dest);
1789 fprintf (dump_file, "\n");
1790 }
1791
1792 if (debug_uses)
1793 /* Instead of resetting the debug insns, we could replace each
1794 debug use in the loop with the sum or product of all expanded
1795        accumulators.  Since we'll only know of all expansions at the
1796 end, we'd have to keep track of which vars_to_expand a debug
1797 insn in the loop references, take note of each copy of the
1798 debug insn during unrolling, and when it's all done, compute
1799 the sum or product of each variable and adjust the original
1800 debug insn and each copy thereof. What a pain! */
1801 reset_debug_uses_in_loop (loop, dest, debug_uses);
1802
1803 /* Record the accumulator to expand. */
1804 ves = XNEW (struct var_to_expand);
1805 ves->insn = insn;
1806 ves->reg = copy_rtx (dest);
1807 ves->var_expansions.create (1);
1808 ves->next = NULL;
1809 ves->op = GET_CODE (src);
1810 ves->expansion_count = 0;
1811 ves->reuse_expansion = 0;
1812 return ves;
1813 }
1814
1815 /* Determine whether there is an induction variable in INSN that
1816 we would like to split during unrolling.
1817
1818 I.e. replace
1819
1820 i = i + 1;
1821 ...
1822 i = i + 1;
1823 ...
1824 i = i + 1;
1825 ...
1826
1827 type chains by
1828
1829 i0 = i + 1
1830 ...
1831 i = i0 + 1
1832 ...
1833 i = i0 + 2
1834 ...
1835
1836 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1837 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1838 pointer to it. */
1839
1840 static struct iv_to_split *
1841 analyze_iv_to_split_insn (rtx insn)
1842 {
1843 rtx set, dest;
1844 struct rtx_iv iv;
1845 struct iv_to_split *ivts;
1846 bool ok;
1847
1848   /* For now we just split the basic induction variables.  Later this may
1849      be extended, for example, by also selecting addresses of memory references.  */
1850 set = single_set (insn);
1851 if (!set)
1852 return NULL;
1853
1854 dest = SET_DEST (set);
1855 if (!REG_P (dest))
1856 return NULL;
1857
1858 if (!biv_p (insn, dest))
1859 return NULL;
1860
1861 ok = iv_analyze_result (insn, dest, &iv);
1862
1863 /* This used to be an assert under the assumption that if biv_p returns
1864 true that iv_analyze_result must also return true. However, that
1865 assumption is not strictly correct as evidenced by pr25569.
1866
1867 Returning NULL when iv_analyze_result returns false is safe and
1868 avoids the problems in pr25569 until the iv_analyze_* routines
1869 can be fixed, which is apparently hard and time consuming
1870 according to their author. */
1871 if (! ok)
1872 return NULL;
1873
1874 if (iv.step == const0_rtx
1875 || iv.mode != iv.extend_mode)
1876 return NULL;
1877
1878 /* Record the insn to split. */
1879 ivts = XNEW (struct iv_to_split);
1880 ivts->insn = insn;
1881 ivts->orig_var = dest;
1882 ivts->base_var = NULL_RTX;
1883 ivts->step = iv.step;
1884 ivts->next = NULL;
1885 ivts->n_loc = 1;
1886 ivts->loc[0] = 1;
1887
1888 return ivts;
1889 }
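
/* For illustration (hypothetical insns): in a loop,

     i = i + 1;    is a basic IV with nonzero step and is accepted;
     j = j + 0;    has step const0_rtx and is rejected above;

   and an IV whose value is only computed in an extended mode
   (iv.mode != iv.extend_mode) is likewise rejected.  */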
1890
1891 /* Determine which insns in LOOP can be optimized. Return an
1892 OPT_INFO struct with the relevant hash tables filled with all
1893 insns to be optimized. The FIRST_NEW_BLOCK field
1894 is undefined for the return value. */
1895
1896 static struct opt_info *
1897 analyze_insns_in_loop (struct loop *loop)
1898 {
1899 basic_block *body, bb;
1900 unsigned i;
1901 struct opt_info *opt_info = XCNEW (struct opt_info);
1902 rtx insn;
1903 struct iv_to_split *ivts = NULL;
1904 struct var_to_expand *ves = NULL;
1905 PTR *slot1;
1906 PTR *slot2;
1907 vec<edge> edges = get_loop_exit_edges (loop);
1908 edge exit;
1909 bool can_apply = false;
1910
1911 iv_analysis_loop_init (loop);
1912
1913 body = get_loop_body (loop);
1914
1915 if (flag_split_ivs_in_unroller)
1916 {
1917 opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1918 si_info_hash, si_info_eq, free);
1919 opt_info->iv_to_split_head = NULL;
1920 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1921 }
1922
1923 /* Record the loop exit bb and loop preheader before the unrolling. */
1924 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1925
1926 if (edges.length () == 1)
1927 {
1928 exit = edges[0];
1929 if (!(exit->flags & EDGE_COMPLEX))
1930 {
1931 opt_info->loop_exit = split_edge (exit);
1932 can_apply = true;
1933 }
1934 }
1935
1936 if (flag_variable_expansion_in_unroller
1937 && can_apply)
1938 {
1939 opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1940 ve_info_hash,
1941 ve_info_eq, free);
1942 opt_info->var_to_expand_head = NULL;
1943 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1944 }
1945
1946 for (i = 0; i < loop->num_nodes; i++)
1947 {
1948 bb = body[i];
1949 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1950 continue;
1951
1952 FOR_BB_INSNS (bb, insn)
1953 {
1954 if (!INSN_P (insn))
1955 continue;
1956
1957 if (opt_info->insns_to_split)
1958 ivts = analyze_iv_to_split_insn (insn);
1959
1960 if (ivts)
1961 {
1962 slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1963 gcc_assert (*slot1 == NULL);
1964 *slot1 = ivts;
1965 *opt_info->iv_to_split_tail = ivts;
1966 opt_info->iv_to_split_tail = &ivts->next;
1967 continue;
1968 }
1969
1970 if (opt_info->insns_with_var_to_expand)
1971 ves = analyze_insn_to_expand_var (loop, insn);
1972
1973 if (ves)
1974 {
1975 slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1976 gcc_assert (*slot2 == NULL);
1977 *slot2 = ves;
1978 *opt_info->var_to_expand_tail = ves;
1979 opt_info->var_to_expand_tail = &ves->next;
1980 }
1981 }
1982 }
1983
1984 edges.release ();
1985 free (body);
1986 return opt_info;
1987 }
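
/* Note, as a reading aid: variable expansion is only enabled above when
   the loop has exactly one exit edge that is not EDGE_COMPLEX, because
   the partial accumulators must be combined in a single known exit
   block (opt_info->loop_exit, created by split_edge).  IV splitting
   has no such requirement.  */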
1988
1989 /* Called just before loop duplication. Records start of duplicated area
1990 to OPT_INFO. */
1991
1992 static void
1993 opt_info_start_duplication (struct opt_info *opt_info)
1994 {
1995 if (opt_info)
1996 opt_info->first_new_block = last_basic_block;
1997 }
1998
1999 /* Determine the number of iterations between initialization of the base
2000 variable and the current copy (N_COPY). N_COPIES is the total number
2001 of newly created copies. UNROLLING is true if we are unrolling
2002 (not peeling) the loop. */
2003
2004 static unsigned
2005 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
2006 {
2007 if (unrolling)
2008 {
2009 /* If we are unrolling, initialization is done in the original loop
2010 body (number 0). */
2011 return n_copy;
2012 }
2013 else
2014 {
2015 /* If we are peeling, the copy in which the initialization occurs has
2016 number 1. The original loop (number 0) is the last. */
2017 if (n_copy)
2018 return n_copy - 1;
2019 else
2020 return n_copies;
2021 }
2022 }
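
/* A worked example of the mapping above (N_COPIES == 3 chosen
   arbitrarily):

     unrolling:  n_copy  0 (original)  1  2  3   ->  delta  0  1  2  3
     peeling:    n_copy  1  2  3  0 (original)   ->  delta  0  1  2  3

   i.e. when peeling, copy 1 runs first and receives the
   initialization, while the original body runs last.  */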
2023
2024 /* Locate in EXPR the expression corresponding to the location recorded
2025 in IVTS, and return a pointer to the RTX for this location. */
2026
2027 static rtx *
2028 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2029 {
2030 unsigned i;
2031 rtx *ret = &expr;
2032
2033 for (i = 0; i < ivts->n_loc; i++)
2034 ret = &XEXP (*ret, ivts->loc[i]);
2035
2036 return ret;
2037 }
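
/* For illustration (a hypothetical pattern): if the single set of the
   insn is

     (set (reg i) (plus (reg i) (const_int 1)))

   then an IVTS with n_loc == 1 and loc[0] == 1, as recorded by
   analyze_iv_to_split_insn, makes the walk above take XEXP with
   index 1 once, returning the address of the SET_SRC, i.e. of the
   (plus ...) expression.  */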
2038
2039 /* Allocate the basic variable for the induction variable chain. */
2040
2041 static void
2042 allocate_basic_variable (struct iv_to_split *ivts)
2043 {
2044 rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2045
2046 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2047 }
2048
2049 /* Insert the initialization of the basic variable of IVTS before INSN,
2050 taking the initial value from INSN. */
2051
2052 static void
2053 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2054 {
2055 rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2056 rtx seq;
2057
2058 start_sequence ();
2059 expr = force_operand (expr, ivts->base_var);
2060 if (expr != ivts->base_var)
2061 emit_move_insn (ivts->base_var, expr);
2062 seq = get_insns ();
2063 end_sequence ();
2064
2065 emit_insn_before (seq, insn);
2066 }
2067
2068 /* Replace the use of the induction variable described in IVTS in INSN
2069 with base variable + DELTA * step. */
2070
2071 static void
2072 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2073 {
2074 rtx expr, *loc, seq, incr, var;
2075 enum machine_mode mode = GET_MODE (ivts->base_var);
2076 rtx src, dest, set;
2077
2078 /* Construct base + DELTA * step. */
2079 if (!delta)
2080 expr = ivts->base_var;
2081 else
2082 {
2083 incr = simplify_gen_binary (MULT, mode,
2084 ivts->step, gen_int_mode (delta, mode));
2085 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2086 ivts->base_var, incr);
2087 }
2088
2089 /* Figure out where to do the replacement. */
2090 loc = get_ivts_expr (single_set (insn), ivts);
2091
2092 /* If we can make the replacement right away, we're done. */
2093 if (validate_change (insn, loc, expr, 0))
2094 return;
2095
2096 /* Otherwise, force EXPR into a register and try again. */
2097 start_sequence ();
2098 var = gen_reg_rtx (mode);
2099 expr = force_operand (expr, var);
2100 if (expr != var)
2101 emit_move_insn (var, expr);
2102 seq = get_insns ();
2103 end_sequence ();
2104 emit_insn_before (seq, insn);
2105
2106 if (validate_change (insn, loc, var, 0))
2107 return;
2108
2109 /* As a last resort, try recreating the assignment in INSN
2110 completely from scratch. */
2111 set = single_set (insn);
2112 gcc_assert (set);
2113
2114 start_sequence ();
2115 *loc = var;
2116 src = copy_rtx (SET_SRC (set));
2117 dest = copy_rtx (SET_DEST (set));
2118 src = force_operand (src, dest);
2119 if (src != dest)
2120 emit_move_insn (dest, src);
2121 seq = get_insns ();
2122 end_sequence ();
2123
2124 emit_insn_before (seq, insn);
2125 delete_insn (insn);
2126 }
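
/* For illustration (hypothetical insn, step == 1): the location
   returned by get_ivts_expr is the whole right-hand side, so in the
   copy with DELTA == 2

     i = i + 1;    becomes    i = i0 + 2;

   where i0 is IVTS->base_var.  With DELTA == 0 the right-hand side
   becomes just i0, after insert_base_initialization has emitted
   i0 = i + 1 in front of the insn.  */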
2127
2128
2129 /* Return one expansion of the accumulator recorded in struct VE. */
2130
2131 static rtx
2132 get_expansion (struct var_to_expand *ve)
2133 {
2134 rtx reg;
2135
2136 if (ve->reuse_expansion == 0)
2137 reg = ve->reg;
2138 else
2139 reg = ve->var_expansions[ve->reuse_expansion - 1];
2140
2141 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2142 ve->reuse_expansion = 0;
2143 else
2144 ve->reuse_expansion++;
2145
2146 return reg;
2147 }
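
/* For illustration (hypothetical state): with two recorded expansions
   sum1 and sum2, successive calls to get_expansion return

     sum, sum1, sum2, sum, sum1, ...

   i.e. the original accumulator and its expansions are handed out in
   round-robin order once no new expansions are created.  */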
2148
2149
2150 /* Given INSN, replace the uses of the accumulator recorded in VE
2151 with a new register. */
2152
2153 static void
2154 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2155 {
2156 rtx new_reg, set;
2157 bool really_new_expansion = false;
2158
2159 set = single_set (insn);
2160 gcc_assert (set);
2161
2162 /* Generate a new register only if the expansion limit has not been
2163 reached. Otherwise reuse an already existing expansion. */
2164 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2165 {
2166 really_new_expansion = true;
2167 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2168 }
2169 else
2170 new_reg = get_expansion (ve);
2171
2172 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2173 if (apply_change_group ())
2174 if (really_new_expansion)
2175 {
2176 ve->var_expansions.safe_push (new_reg);
2177 ve->expansion_count++;
2178 }
2179 }
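
/* For illustration (parameter value made up): with
   --param max-variable-expansions-in-unroller=2 and an 8x unroll,
   the accumulator insn in the first two copies gets fresh registers
   sum1 and sum2, and the insn in the remaining copies cycles through
   sum, sum1, sum2 via get_expansion above.  */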
2180
2181 /* Initialize the variable expansions in the loop preheader. PLACE is the
2182 loop-preheader basic block where the initialization of the
2183 expansions should take place. The expansions are initialized with
2184 (-0) when the operation is plus or minus, to honor signed zero. This
2185 way we can prevent cases where the sign of the final result is
2186 affected by the sign of the expansion. Here is an example to
2187 demonstrate this:
2188
2189 for (i = 0 ; i < n; i++)
2190 sum += something;
2191
2192 ==>
2193
2194 sum += something
2195 ....
2196 i = i+1;
2197 sum1 += something
2198 ....
2199 i = i+1
2200 sum2 += something;
2201 ....
2202
2203 When SUM is initialized with -zero and SOMETHING is also -zero, the
2204 final result of sum should be -zero, and thus the expansions sum1 and sum2
2205 should be initialized with -zero as well (otherwise we would get +zero
2206 as the final result). */
2207
2208 static void
2209 insert_var_expansion_initialization (struct var_to_expand *ve,
2210 basic_block place)
2211 {
2212 rtx seq, var, zero_init;
2213 unsigned i;
2214 enum machine_mode mode = GET_MODE (ve->reg);
2215 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2216
2217 if (ve->var_expansions.length () == 0)
2218 return;
2219
2220 start_sequence ();
2221 switch (ve->op)
2222 {
2223 case FMA:
2224 /* Note that we only accumulate FMA via the ADD operand. */
2225 case PLUS:
2226 case MINUS:
2227 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2228 {
2229 if (honor_signed_zero_p)
2230 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2231 else
2232 zero_init = CONST0_RTX (mode);
2233 emit_move_insn (var, zero_init);
2234 }
2235 break;
2236
2237 case MULT:
2238 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2239 {
2240 zero_init = CONST1_RTX (GET_MODE (var));
2241 emit_move_insn (var, zero_init);
2242 }
2243 break;
2244
2245 default:
2246 gcc_unreachable ();
2247 }
2248
2249 seq = get_insns ();
2250 end_sequence ();
2251
2252 emit_insn_after (seq, BB_END (place));
2253 }
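
/* For illustration (hypothetical expansions sum1 and sum2 of a
   floating-point PLUS accumulator): the sequence emitted into the
   preheader above is equivalent to

     sum1 = -0.0;
     sum2 = -0.0;

   whereas a MULT accumulator would have its expansions initialized
   with 1.0 instead.  */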
2254
2255 /* Combine the variable expansions at the loop exit. PLACE is the
2256 loop exit basic block where the summation of the expansions should
2257 take place. */
2258
2259 static void
2260 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2261 {
2262 rtx sum = ve->reg;
2263 rtx expr, seq, var, insn;
2264 unsigned i;
2265
2266 if (ve->var_expansions.length () == 0)
2267 return;
2268
2269 start_sequence ();
2270 switch (ve->op)
2271 {
2272 case FMA:
2273 /* Note that we only accumulate FMA via the ADD operand. */
2274 case PLUS:
2275 case MINUS:
2276 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2277 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2278 break;
2279
2280 case MULT:
2281 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2282 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2283 break;
2284
2285 default:
2286 gcc_unreachable ();
2287 }
2288
2289 expr = force_operand (sum, ve->reg);
2290 if (expr != ve->reg)
2291 emit_move_insn (ve->reg, expr);
2292 seq = get_insns ();
2293 end_sequence ();
2294
2295 insn = BB_HEAD (place);
2296 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2297 insn = NEXT_INSN (insn);
2298
2299 emit_insn_after (seq, insn);
2300 }
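
/* For illustration (hypothetical expansions sum1 and sum2): the code
   emitted after the exit block's NOTE_INSN_BASIC_BLOCK is equivalent to

     sum = sum2 + (sum1 + sum);

   for PLUS, MINUS and FMA accumulators, or to the corresponding
   product for MULT.  */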
2301
2302 /* Strip away REG_EQUAL notes for IVs we're splitting.
2303
2304 Updating REG_EQUAL notes for IVs we split is tricky: we
2305 cannot tell, until after unrolling, DF-rescanning, and liveness
2306 updating, whether an EQ_USE is reached by the split IV while
2307 the IV reg is still live. See PR55006.
2308
2309 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2310 because RTL loop-iv requires us to defer rescanning insns and
2311 any notes attached to them. So resort to old techniques... */
2312
2313 static void
2314 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2315 {
2316 struct iv_to_split *ivts;
2317 rtx note = find_reg_equal_equiv_note (insn);
2318 if (! note)
2319 return;
2320 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2321 if (reg_mentioned_p (ivts->orig_var, note))
2322 {
2323 remove_note (insn, note);
2324 return;
2325 }
2326 }
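
/* For illustration (a hypothetical note): if a copied insn carries
   a note

     (expr_list:REG_EQUAL (plus:SI (reg i) (const_int 1)) ...)

   and I is one of the IVs being split, the note is removed above,
   since after splitting it may describe a value that the still-live
   original IV register no longer holds in that copy.  */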
2327
2328 /* Apply loop optimizations in loop copies using the
2329 data gathered during the unrolling. Structure
2330 OPT_INFO records that data.
2331
2332 UNROLLING is true if we unrolled (not peeled) the loop.
2333 REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
2334 the loop (as happens in complete unrolling, but not in ordinary
2335 peeling of the loop). */
2336
2337 static void
2338 apply_opt_in_copies (struct opt_info *opt_info,
2339 unsigned n_copies, bool unrolling,
2340 bool rewrite_original_loop)
2341 {
2342 unsigned i, delta;
2343 basic_block bb, orig_bb;
2344 rtx insn, orig_insn, next;
2345 struct iv_to_split ivts_templ, *ivts;
2346 struct var_to_expand ve_templ, *ves;
2347
2348 /* Sanity check -- we need to put initialization in the original loop
2349 body. */
2350 gcc_assert (!unrolling || rewrite_original_loop);
2351
2352 /* Allocate the basic variables (i0). */
2353 if (opt_info->insns_to_split)
2354 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2355 allocate_basic_variable (ivts);
2356
2357 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2358 {
2359 bb = BASIC_BLOCK (i);
2360 orig_bb = get_bb_original (bb);
2361
2362 /* bb->aux holds the position in the copy sequence, as initialized by
2363 duplicate_loop_to_header_edge. */
2364 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2365 unrolling);
2366 bb->aux = 0;
2367 orig_insn = BB_HEAD (orig_bb);
2368 FOR_BB_INSNS_SAFE (bb, insn, next)
2369 {
2370 if (!INSN_P (insn)
2371 || (DEBUG_INSN_P (insn)
2372 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2373 continue;
2374
2375 while (!INSN_P (orig_insn)
2376 || (DEBUG_INSN_P (orig_insn)
2377 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2378 == LABEL_DECL)))
2379 orig_insn = NEXT_INSN (orig_insn);
2380
2381 ivts_templ.insn = orig_insn;
2382 ve_templ.insn = orig_insn;
2383
2384 /* Apply splitting iv optimization. */
2385 if (opt_info->insns_to_split)
2386 {
2387 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2388
2389 ivts = (struct iv_to_split *)
2390 htab_find (opt_info->insns_to_split, &ivts_templ);
2391
2392 if (ivts)
2393 {
2394 gcc_assert (GET_CODE (PATTERN (insn))
2395 == GET_CODE (PATTERN (orig_insn)));
2396
2397 if (!delta)
2398 insert_base_initialization (ivts, insn);
2399 split_iv (ivts, insn, delta);
2400 }
2401 }
2402 /* Apply variable expansion optimization. */
2403 if (unrolling && opt_info->insns_with_var_to_expand)
2404 {
2405 ves = (struct var_to_expand *)
2406 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2407 if (ves)
2408 {
2409 gcc_assert (GET_CODE (PATTERN (insn))
2410 == GET_CODE (PATTERN (orig_insn)));
2411 expand_var_during_unrolling (ves, insn);
2412 }
2413 }
2414 orig_insn = NEXT_INSN (orig_insn);
2415 }
2416 }
2417
2418 if (!rewrite_original_loop)
2419 return;
2420
2421 /* Initialize the variable expansions in the loop preheader
2422 and take care of combining them at the loop exit. */
2423 if (opt_info->insns_with_var_to_expand)
2424 {
2425 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2426 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2427 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2428 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2429 }
2430
2431 /* Also rewrite the original loop body. Find its blocks as the originals
2432 of the blocks in the last copied iteration, i.e. those that have
2433 get_bb_copy (get_bb_original (bb)) == bb. */
2434 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2435 {
2436 bb = BASIC_BLOCK (i);
2437 orig_bb = get_bb_original (bb);
2438 if (get_bb_copy (orig_bb) != bb)
2439 continue;
2440
2441 delta = determine_split_iv_delta (0, n_copies, unrolling);
2442 for (orig_insn = BB_HEAD (orig_bb);
2443 orig_insn != NEXT_INSN (BB_END (bb));
2444 orig_insn = next)
2445 {
2446 next = NEXT_INSN (orig_insn);
2447
2448 if (!INSN_P (orig_insn))
2449 continue;
2450
2451 ivts_templ.insn = orig_insn;
2452 if (opt_info->insns_to_split)
2453 {
2454 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2455
2456 ivts = (struct iv_to_split *)
2457 htab_find (opt_info->insns_to_split, &ivts_templ);
2458 if (ivts)
2459 {
2460 if (!delta)
2461 insert_base_initialization (ivts, orig_insn);
2462 split_iv (ivts, orig_insn, delta);
2463 continue;
2464 }
2465 }
2466
2467 }
2468 }
2469 }
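
/* For illustration, both optimizations on a hypothetical 2x unroll
   (one new copy, floating-point SUM, IV step 1):

     sum += a[i];              sum += a[i];
     i = i + 1;                i0 = i + 1;
     sum += a[i];     ==>      i = i0;
     i = i + 1;                sum1 += a[i];
                               i = i0 + 1;

   with sum1 = -0.0 emitted in the preheader and sum = sum1 + sum in
   the exit block by the routines above.  */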
2470
2471 /* Release OPT_INFO. */
2472
2473 static void
2474 free_opt_info (struct opt_info *opt_info)
2475 {
2476 if (opt_info->insns_to_split)
2477 htab_delete (opt_info->insns_to_split);
2478 if (opt_info->insns_with_var_to_expand)
2479 {
2480 struct var_to_expand *ves;
2481
2482 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2483 ves->var_expansions.release ();
2484 htab_delete (opt_info->insns_with_var_to_expand);
2485 }
2486 free (opt_info);
2487 }