gcc/modulo-sched.c
1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
22
23
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
50 #include "timevar.h"
51 #include "tree-pass.h"
52
53 #ifdef INSN_SCHEDULING
54
55 /* This file contains the implementation of the Swing Modulo Scheduler,
56 described in the following references:
57 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
 58     Lifetime-sensitive modulo scheduling in a production environment.
59 IEEE Trans. on Comps., 50(3), March 2001
60 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
 62     PACT '96, pages 80-87, October 1996 (Boston, Massachusetts, USA).
63
64 The basic structure is:
65 1. Build a data-dependence graph (DDG) for each loop.
66 2. Use the DDG to order the insns of a loop (not in topological order
67 necessarily, but rather) trying to place each insn after all its
68 predecessors _or_ after all its successors.
69 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70 4. Use the ordering to perform list-scheduling of the loop:
71 1. Set II = MII. We will try to schedule the loop within II cycles.
72 2. Try to schedule the insns one by one according to the ordering.
73 For each insn compute an interval of cycles by considering already-
74 scheduled preds and succs (and associated latencies); try to place
75 the insn in the cycles of this window checking for potential
76 resource conflicts (using the DFA interface).
77 Note: this is different from the cycle-scheduling of schedule_insns;
78 here the insns are not scheduled monotonically top-down (nor bottom-
79 up).
 80     3. If we fail to schedule all the insns, bump II and try again, unless
81 II reaches an upper bound MaxII, in which case report failure.
82 5. If we succeeded in scheduling the loop within II cycles, we now
83 generate prolog and epilog, decrease the counter of the loop, and
84 perform modulo variable expansion for live ranges that span more than
85 II cycles (i.e. use register copies to prevent a def from overwriting
86 itself before reaching the use).
87 */
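/* For illustration of step 5: with II = 2, a register defined in cycle 0
   and used in cycle 5 must survive (5 - 0) / 2 = 2 subsequent kernel
   iterations that each re-execute the def; modulo variable expansion
   inserts two register copies so that each overlapping iteration reads
   the copy made before the def clobbers the original value.  */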
88
89 \f
90 /* This page defines partial-schedule structures and functions for
91 modulo scheduling. */
92
93 typedef struct partial_schedule *partial_schedule_ptr;
94 typedef struct ps_insn *ps_insn_ptr;
95
96 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
97 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
98
99 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
100 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
101
102 /* Perform signed modulo, always returning a non-negative value. */
103 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
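/* For example, SMODULO (-1, 4) is 3, whereas C's (-1) % 4 is -1 (C99
   division truncates toward zero); this keeps row indices computed from
   negative cycles within [0, y).  */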
104
105 /* The number of different iterations the nodes in ps span, assuming
106 the stage boundaries are placed efficiently. */
107 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
108 + 1 + (ps)->ii - 1) / (ps)->ii)
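/* For example, min_cycle = 0, max_cycle = 5 and ii = 3 give
   (5 - 0 + 1 + 3 - 1) / 3 = 2 stages, i.e. the kernel overlaps two
   iterations of the original loop.  */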
109
110 /* A single instruction in the partial schedule. */
111 struct ps_insn
112 {
113 /* The corresponding DDG_NODE. */
114 ddg_node_ptr node;
115
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
118 int cycle;
119
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row,
122 prev_in_row;
123
124 /* The number of nodes in the same row that come after this node. */
125 int row_rest_count;
126 };
127
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
132 {
133 int ii; /* Number of rows in the partial schedule. */
134 int history; /* Threshold for conflict checking using DFA. */
135
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
137 ps_insn_ptr *rows;
138
139 /* The earliest absolute cycle of an insn in the partial schedule. */
140 int min_cycle;
141
142 /* The latest absolute cycle of an insn in the partial schedule. */
143 int max_cycle;
144
145 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
146 };
147
148 /* We use this to record all the register replacements we do in
149 the kernel so we can undo SMS if it is not profitable. */
150 struct undo_replace_buff_elem
151 {
152 rtx insn;
153 rtx orig_reg;
154 rtx new_reg;
155 struct undo_replace_buff_elem *next;
156 };
157
158
159
160 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
161 static void free_partial_schedule (partial_schedule_ptr);
162 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
163 void print_partial_schedule (partial_schedule_ptr, FILE *);
164 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
165 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
166 ddg_node_ptr node, int cycle,
167 sbitmap must_precede,
168 sbitmap must_follow);
169 static void rotate_partial_schedule (partial_schedule_ptr, int);
170 void set_row_column_for_ps (partial_schedule_ptr);
171 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr);
172
173 \f
174 /* This page defines constants and structures for the modulo scheduling
175 driver. */
176
177 /* As in haifa-sched.c: */
178 /* issue_rate is the number of insns that can be scheduled in the same
179 machine cycle. It can be defined in the config/mach/mach.h file,
180 otherwise we set it to 1. */
181
182 static int issue_rate;
183
184 static int sms_order_nodes (ddg_ptr, int, int * result);
185 static void set_node_sched_params (ddg_ptr);
186 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
187 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
188 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *, rtx);
189 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
190 int from_stage, int to_stage,
191 int is_prolog);
192
193 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
194 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
195 #define SCHED_FIRST_REG_MOVE(x) \
196 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
197 #define SCHED_NREG_MOVES(x) \
198 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
199 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
200 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
201 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
202
203 /* The scheduling parameters held for each node. */
204 typedef struct node_sched_params
205 {
206 int asap; /* A lower-bound on the absolute scheduling cycle. */
207 int time; /* The absolute scheduling cycle (time >= asap). */
208
209 /* The following field (first_reg_move) is a pointer to the first
210 register-move instruction added to handle the modulo-variable-expansion
211 of the register defined by this node. This register-move copies the
212 original register defined by the node. */
213 rtx first_reg_move;
214
215 /* The number of register-move instructions added, immediately preceding
216 first_reg_move. */
217 int nreg_moves;
218
219 int row; /* Holds time % ii. */
220 int stage; /* Holds time / ii. */
221
222 /* The column of a node inside the ps. If nodes u, v are on the same row,
223 u will precede v if column (u) < column (v). */
224 int column;
225 } *node_sched_params_ptr;
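/* For illustration: time = 7 with ii = 3 gives row = 7 % 3 = 1 and
   stage = 7 / 3 = 2, i.e. the insn occupies row 1 of the kernel and
   belongs to stage 2.  */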
226
227 \f
228 /* The following three functions are copied from the current scheduler
229 code in order to use sched_analyze() for computing the dependencies.
230 They are used when initializing the sched_info structure. */
231 static const char *
232 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233 {
234 static char tmp[80];
235
236 sprintf (tmp, "i%4d", INSN_UID (insn));
237 return tmp;
238 }
239
240 static void
241 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
242 regset cond_exec ATTRIBUTE_UNUSED,
243 regset used ATTRIBUTE_UNUSED,
244 regset set ATTRIBUTE_UNUSED)
245 {
246 }
247
248 static struct sched_info sms_sched_info =
249 {
250 NULL,
251 NULL,
252 NULL,
253 NULL,
254 NULL,
255 sms_print_insn,
256 NULL,
257 compute_jump_reg_dependencies,
258 NULL, NULL,
259 NULL, NULL,
260 0, 0, 0,
261
262 NULL, NULL, NULL, NULL, NULL,
263 #ifdef ENABLE_CHECKING
264 NULL,
265 #endif
266 0
267 };
268
269
270 /* Return the register decremented and tested in INSN,
271 or zero if it is not a decrement-and-branch insn. */
272
273 static rtx
274 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
275 {
276 #ifdef HAVE_doloop_end
277 rtx pattern, reg, condition;
278
279 if (! JUMP_P (insn))
280 return NULL_RTX;
281
282 pattern = PATTERN (insn);
283 condition = doloop_condition_get (pattern);
284 if (! condition)
285 return NULL_RTX;
286
287 if (REG_P (XEXP (condition, 0)))
288 reg = XEXP (condition, 0);
289 else if (GET_CODE (XEXP (condition, 0)) == PLUS
290 && REG_P (XEXP (XEXP (condition, 0), 0)))
291 reg = XEXP (XEXP (condition, 0), 0);
292 else
293 gcc_unreachable ();
294
295 return reg;
296 #else
297 return NULL_RTX;
298 #endif
299 }
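/* The condition returned by doloop_condition_get typically has one of
   the shapes (ne (reg) (const_int 1)) or
   (ne (plus (reg) (const_int -1)) (const_int 0)) -- hence the REG and
   PLUS cases above; the exact form is target-dependent.  */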
300
301 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
302 that the number of iterations is a compile-time constant. If so,
303 return the rtx that sets COUNT_REG to a constant, and set COUNT to
304 this constant. Otherwise return 0. */
305 static rtx
306 const_iteration_count (rtx count_reg, basic_block pre_header,
307 HOST_WIDEST_INT * count)
308 {
309 rtx insn;
310 rtx head, tail;
311
312 if (! pre_header)
313 return NULL_RTX;
314
315 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
316
317 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
318 if (INSN_P (insn) && single_set (insn) &&
319 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
320 {
321 rtx pat = single_set (insn);
322
323 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
324 {
325 *count = INTVAL (SET_SRC (pat));
326 return insn;
327 }
328
329 return NULL_RTX;
330 }
331
332 return NULL_RTX;
333 }
334
335 /* A very simple resource-based lower bound on the initiation interval.
336 ??? Improve the accuracy of this bound by considering the
337 utilization of various units. */
338 static int
339 res_MII (ddg_ptr g)
340 {
341 return (g->num_nodes / issue_rate);
342 }
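/* For illustration: a loop of 8 insns on a machine that issues 4 insns
   per cycle gives res_MII = 8 / 4 = 2.  Note that the truncating
   division yields 0 when the loop has fewer insns than the issue rate.  */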
343
344
345 /* Points to the array that contains the sched data for each node. */
346 static node_sched_params_ptr node_sched_params;
347
348 /* Allocate sched_params for each node and initialize it. Assumes that
349    the aux field of each node contains the asap bound (computed earlier),
350 and copies it into the sched_params field. */
351 static void
352 set_node_sched_params (ddg_ptr g)
353 {
354 int i;
355
356 /* Allocate for each node in the DDG a place to hold the "sched_data". */
357   /* Initialize ASAP/ALAP/HEIGHT to zero.  */
358 node_sched_params = (node_sched_params_ptr)
359 xcalloc (g->num_nodes,
360 sizeof (struct node_sched_params));
361
362 /* Set the pointer of the general data of the node to point to the
363 appropriate sched_params structure. */
364 for (i = 0; i < g->num_nodes; i++)
365 {
366 /* Watch out for aliasing problems? */
367 node_sched_params[i].asap = g->nodes[i].aux.count;
368 g->nodes[i].aux.info = &node_sched_params[i];
369 }
370 }
371
372 static void
373 print_node_sched_params (FILE * file, int num_nodes)
374 {
375 int i;
376
377 if (! file)
378 return;
379 for (i = 0; i < num_nodes; i++)
380 {
381 node_sched_params_ptr nsp = &node_sched_params[i];
382 rtx reg_move = nsp->first_reg_move;
383 int j;
384
385 fprintf (file, "Node %d:\n", i);
386 fprintf (file, " asap = %d:\n", nsp->asap);
387 fprintf (file, " time = %d:\n", nsp->time);
388 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
389 for (j = 0; j < nsp->nreg_moves; j++)
390 {
391 fprintf (file, " reg_move = ");
392 print_rtl_single (file, reg_move);
393 reg_move = PREV_INSN (reg_move);
394 }
395 }
396 }
397
398 /* Calculate an upper bound for II. SMS should not schedule the loop if it
399 requires more cycles than this bound. Currently set to the sum of the
400 longest latency edge for each node. Reset based on experiments. */
401 static int
402 calculate_maxii (ddg_ptr g)
403 {
404 int i;
405 int maxii = 0;
406
407 for (i = 0; i < g->num_nodes; i++)
408 {
409 ddg_node_ptr u = &g->nodes[i];
410 ddg_edge_ptr e;
411 int max_edge_latency = 0;
412
413 for (e = u->out; e; e = e->next_out)
414 max_edge_latency = MAX (max_edge_latency, e->latency);
415
416 maxii += max_edge_latency;
417 }
418 return maxii;
419 }
420
421 /*
422 Breaking intra-loop register anti-dependences:
423 Each intra-loop register anti-dependence implies a cross-iteration true
424 dependence of distance 1. Therefore, we can remove such false dependencies
425 and figure out if the partial schedule broke them by checking if (for a
426 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
427 if so generate a register move. The number of such moves is equal to:
428                  SCHED_TIME (use) - SCHED_TIME (def)         { 0  broken
429     nreg_moves = ----------------------------------- + 1 -   {    dependence.
430                                   ii                          { 1  if not.
431 */
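/* Worked example: in generate_reg_moves below, a distance-1 true
   dependence with SCHED_TIME (def) = 1, SCHED_TIME (use) = 6 and ii = 2
   needs (6 - 1 + 2) / 2 = 3 register copies.  */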
432 static struct undo_replace_buff_elem *
433 generate_reg_moves (partial_schedule_ptr ps)
434 {
435 ddg_ptr g = ps->g;
436 int ii = ps->ii;
437 int i;
438 struct undo_replace_buff_elem *reg_move_replaces = NULL;
439
440 for (i = 0; i < g->num_nodes; i++)
441 {
442 ddg_node_ptr u = &g->nodes[i];
443 ddg_edge_ptr e;
444 int nreg_moves = 0, i_reg_move;
445 sbitmap *uses_of_defs;
446 rtx last_reg_move;
447 rtx prev_reg, old_reg;
448
449 /* Compute the number of reg_moves needed for u, by looking at life
450 ranges started at u (excluding self-loops). */
451 for (e = u->out; e; e = e->next_out)
452 if (e->type == TRUE_DEP && e->dest != e->src)
453 {
454 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
455
456 if (e->distance == 1)
457 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
458
459 /* If dest precedes src in the schedule of the kernel, then dest
460 will read before src writes and we can save one reg_copy. */
461 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
462 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
463 nreg_moves4e--;
464
465 nreg_moves = MAX (nreg_moves, nreg_moves4e);
466 }
467
468 if (nreg_moves == 0)
469 continue;
470
471 /* Every use of the register defined by node may require a different
472 copy of this register, depending on the time the use is scheduled.
473 Set a bitmap vector, telling which nodes use each copy of this
474 register. */
475 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
476 sbitmap_vector_zero (uses_of_defs, nreg_moves);
477 for (e = u->out; e; e = e->next_out)
478 if (e->type == TRUE_DEP && e->dest != e->src)
479 {
480 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
481
482 if (e->distance == 1)
483 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
484
485 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
486 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
487 dest_copy--;
488
489 if (dest_copy)
490 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
491 }
492
493 /* Now generate the reg_moves, attaching relevant uses to them. */
494 SCHED_NREG_MOVES (u) = nreg_moves;
495 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
496 last_reg_move = u->insn;
497
498 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
499 {
500 unsigned int i_use = 0;
501 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
502 rtx reg_move = gen_move_insn (new_reg, prev_reg);
503 sbitmap_iterator sbi;
504
505 add_insn_before (reg_move, last_reg_move);
506 last_reg_move = reg_move;
507
508 if (!SCHED_FIRST_REG_MOVE (u))
509 SCHED_FIRST_REG_MOVE (u) = reg_move;
510
511 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
512 {
513 struct undo_replace_buff_elem *rep;
514
515 rep = (struct undo_replace_buff_elem *)
516 xcalloc (1, sizeof (struct undo_replace_buff_elem));
517 rep->insn = g->nodes[i_use].insn;
518 rep->orig_reg = old_reg;
519 rep->new_reg = new_reg;
520
521 if (! reg_move_replaces)
522 reg_move_replaces = rep;
523 else
524 {
525 rep->next = reg_move_replaces;
526 reg_move_replaces = rep;
527 }
528
529 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
530 }
531
532 prev_reg = new_reg;
533 }
534 sbitmap_vector_free (uses_of_defs);
535 }
536 return reg_move_replaces;
537 }
538
539 /* We call this when we want to undo the SMS schedule for a given loop.
540 One of the things that we do is to delete the register moves generated
541 for the sake of SMS; this function deletes the register move instructions
542 recorded in the undo buffer. */
543 static void
544 undo_generate_reg_moves (partial_schedule_ptr ps,
545 struct undo_replace_buff_elem *reg_move_replaces)
546 {
547 int i,j;
548
549 for (i = 0; i < ps->g->num_nodes; i++)
550 {
551 ddg_node_ptr u = &ps->g->nodes[i];
552 rtx prev;
553 rtx crr = SCHED_FIRST_REG_MOVE (u);
554
555 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
556 {
557 prev = PREV_INSN (crr);
558 delete_insn (crr);
559 crr = prev;
560 }
561 SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
562 }
563
564 while (reg_move_replaces)
565 {
566 struct undo_replace_buff_elem *rep = reg_move_replaces;
567
568 reg_move_replaces = reg_move_replaces->next;
569 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
570 }
571 }
572
573 /* Free memory allocated for the undo buffer. */
574 static void
575 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
576 {
577
578 while (reg_move_replaces)
579 {
580 struct undo_replace_buff_elem *rep = reg_move_replaces;
581
582 reg_move_replaces = reg_move_replaces->next;
583 free (rep);
584 }
585 }
586
587 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
588 of SCHED_ROW and SCHED_STAGE. */
589 static void
590 normalize_sched_times (partial_schedule_ptr ps)
591 {
592 int i;
593 ddg_ptr g = ps->g;
594 int amount = PS_MIN_CYCLE (ps);
595 int ii = ps->ii;
596
597 /* Don't include the closing branch assuming that it is the last node. */
598 for (i = 0; i < g->num_nodes - 1; i++)
599 {
600 ddg_node_ptr u = &g->nodes[i];
601 int normalized_time = SCHED_TIME (u) - amount;
602
603 gcc_assert (normalized_time >= 0);
604
605 SCHED_TIME (u) = normalized_time;
606 SCHED_ROW (u) = normalized_time % ii;
607 SCHED_STAGE (u) = normalized_time / ii;
608 }
609 }
610
611 /* Set SCHED_COLUMN of each node according to its position in PS. */
612 static void
613 set_columns_for_ps (partial_schedule_ptr ps)
614 {
615 int row;
616
617 for (row = 0; row < ps->ii; row++)
618 {
619 ps_insn_ptr cur_insn = ps->rows[row];
620 int column = 0;
621
622 for (; cur_insn; cur_insn = cur_insn->next_in_row)
623 SCHED_COLUMN (cur_insn->node) = column++;
624 }
625 }
626
627 /* Permute the insns according to their order in PS, from row 0 to
628 row ii-1, and position them right before LAST. This schedules
629 the insns of the loop kernel. */
630 static void
631 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
632 {
633 int ii = ps->ii;
634 int row;
635 ps_insn_ptr ps_ij;
636
637 for (row = 0; row < ii ; row++)
638 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
639 if (PREV_INSN (last) != ps_ij->node->insn)
640 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
641 PREV_INSN (last));
642 }
643
644 /* As part of undoing SMS we return to the original ordering of the
645 instructions inside the loop kernel. Given the partial schedule PS, this
646    function restores the ordering of the instructions according to their
647    CUIDs in the DDG (PS->G), which is the original order of the instructions
648    before performing SMS.  */
649 static void
650 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
651 {
652 int i;
653
654 for (i = 0 ; i < ps->g->num_nodes; i++)
655 if (last == ps->g->nodes[i].insn
656 || last == ps->g->nodes[i].first_note)
657 break;
658 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
659 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
660 PREV_INSN (last));
661 }
662
663 /* Used to generate the prologue & epilogue. Duplicate the subset of
664 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
665 of both), together with a prefix/suffix of their reg_moves. */
666 static void
667 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
668 int to_stage, int for_prolog)
669 {
670 int row;
671 ps_insn_ptr ps_ij;
672
673 for (row = 0; row < ps->ii; row++)
674 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
675 {
676 ddg_node_ptr u_node = ps_ij->node;
677 int j, i_reg_moves;
678 rtx reg_move = NULL_RTX;
679
680 if (for_prolog)
681 {
682 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
683 number of reg_moves starting with the second occurrence of
684 u_node, which is generated if its SCHED_STAGE <= to_stage. */
685 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
686 i_reg_moves = MAX (i_reg_moves, 0);
687 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
688
689 /* The reg_moves start from the *first* reg_move backwards. */
690 if (i_reg_moves)
691 {
692 reg_move = SCHED_FIRST_REG_MOVE (u_node);
693 for (j = 1; j < i_reg_moves; j++)
694 reg_move = PREV_INSN (reg_move);
695 }
696 }
697 else /* It's for the epilog. */
698 {
699 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
700 starting to decrease one stage after u_node no longer occurs;
701 that is, generate all reg_moves until
702 SCHED_STAGE (u_node) == from_stage - 1. */
703 i_reg_moves = SCHED_NREG_MOVES (u_node)
704 - (from_stage - SCHED_STAGE (u_node) - 1);
705 i_reg_moves = MAX (i_reg_moves, 0);
706 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
707
708 /* The reg_moves start from the *last* reg_move forwards. */
709 if (i_reg_moves)
710 {
711 reg_move = SCHED_FIRST_REG_MOVE (u_node);
712 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
713 reg_move = PREV_INSN (reg_move);
714 }
715 }
716
717 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
718 emit_insn (copy_rtx (PATTERN (reg_move)));
719 if (SCHED_STAGE (u_node) >= from_stage
720 && SCHED_STAGE (u_node) <= to_stage)
721 duplicate_insn_chain (u_node->first_note, u_node->insn);
722 }
723 }
724
725
726 /* Generate the instructions (including reg_moves) for prolog & epilog. */
727 static void
728 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
729 {
730 int i;
731 int last_stage = PS_STAGE_COUNT (ps) - 1;
732 edge e;
733
734 /* Generate the prolog, inserting its insns on the loop-entry edge. */
735 start_sequence ();
736
737 if (count_reg)
738 /* Generate a subtract instruction at the beginning of the prolog to
739 adjust the loop count by STAGE_COUNT. */
740 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
741
742 for (i = 0; i < last_stage; i++)
743 duplicate_insns_of_cycles (ps, 0, i, 1);
744
745 /* Put the prolog on the entry edge. */
746 e = loop_preheader_edge (loop);
747 split_edge_and_insert (e, get_insns ());
748
749 end_sequence ();
750
751 /* Generate the epilog, inserting its insns on the loop-exit edge. */
752 start_sequence ();
753
754 for (i = 0; i < last_stage; i++)
755 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
756
757 /* Put the epilogue on the exit edge. */
758 gcc_assert (single_exit (loop));
759 e = single_exit (loop);
760 split_edge_and_insert (e, get_insns ());
761 end_sequence ();
762 }
763
764 /* Return true if all the BBs of the loop are empty except the
765 loop header. */
766 static bool
767 loop_single_full_bb_p (struct loop *loop)
768 {
769 unsigned i;
770 basic_block *bbs = get_loop_body (loop);
771
772 for (i = 0; i < loop->num_nodes ; i++)
773 {
774 rtx head, tail;
775 bool empty_bb = true;
776
777 if (bbs[i] == loop->header)
778 continue;
779
780 /* Make sure that basic blocks other than the header
781          have only notes, labels or jumps.  */
782 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
783 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
784 {
785 if (NOTE_P (head) || LABEL_P (head)
786 || (INSN_P (head) && JUMP_P (head)))
787 continue;
788 empty_bb = false;
789 break;
790 }
791
792 if (! empty_bb)
793 {
794 free (bbs);
795 return false;
796 }
797 }
798 free (bbs);
799 return true;
800 }
801
802 /* A simple loop from SMS point of view; it is a loop that is composed of
803 either a single basic block or two BBs - a header and a latch. */
804 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
805 && (EDGE_COUNT (loop->latch->preds) == 1) \
806 && (EDGE_COUNT (loop->latch->succs) == 1))
807
808 /* Return true if the loop is in its canonical form and false if not;
809    i.e. it satisfies SIMPLE_SMS_LOOP_P, has a preheader block and a single exit.  */
810 static bool
811 loop_canon_p (struct loop *loop)
812 {
813
814 if (loop->inner || !loop_outer (loop))
815 return false;
816
817 if (!single_exit (loop))
818 {
819 if (dump_file)
820 {
821 rtx insn = BB_END (loop->header);
822
823 fprintf (dump_file, "SMS loop many exits ");
824 fprintf (dump_file, " %s %d (file, line)\n",
825 insn_file (insn), insn_line (insn));
826 }
827 return false;
828 }
829
830 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
831 {
832 if (dump_file)
833 {
834 rtx insn = BB_END (loop->header);
835
836 fprintf (dump_file, "SMS loop many BBs. ");
837 fprintf (dump_file, " %s %d (file, line)\n",
838 insn_file (insn), insn_line (insn));
839 }
840 return false;
841 }
842
843 return true;
844 }
845
846 /* If there is more than one entry to the loop,
847 make it one by splitting the first entry edge and
848 redirecting the others to the new BB. */
849 static void
850 canon_loop (struct loop *loop)
851 {
852 edge e;
853 edge_iterator i;
854
855 /* Avoid annoying special cases of edges going to exit
856 block. */
857 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
858 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
859 split_edge (e);
860
861 if (loop->latch == loop->header
862 || EDGE_COUNT (loop->latch->succs) > 1)
863 {
864 FOR_EACH_EDGE (e, i, loop->header->preds)
865 if (e->src == loop->latch)
866 break;
867 split_edge (e);
868 }
869 }
870
871 /* Probability in % that the sms-ed loop rolls enough so that optimized
872 version may be entered. Just a guess. */
873 #define PROB_SMS_ENOUGH_ITERATIONS 80
874
875 /* Main entry point, perform SMS scheduling on the loops of the function
876 that consist of single basic blocks. */
877 static void
878 sms_schedule (void)
879 {
880 static int passes = 0;
881 rtx insn;
882 ddg_ptr *g_arr, g;
883 int * node_order;
884 int maxii;
885 loop_iterator li;
886 partial_schedule_ptr ps;
887 struct df *df;
888 basic_block bb = NULL;
889 struct loop *loop;
890 basic_block condition_bb = NULL;
891 edge latch_edge;
892 gcov_type trip_count = 0;
893
894 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
895 | LOOPS_HAVE_RECORDED_EXITS);
896 if (number_of_loops () <= 1)
897 {
898 loop_optimizer_finalize ();
899 return; /* There are no loops to schedule. */
900 }
901
902 /* Initialize issue_rate. */
903 if (targetm.sched.issue_rate)
904 {
905 int temp = reload_completed;
906
907 reload_completed = 1;
908 issue_rate = targetm.sched.issue_rate ();
909 reload_completed = temp;
910 }
911 else
912 issue_rate = 1;
913
914 /* Initialize the scheduler. */
915 current_sched_info = &sms_sched_info;
916 sched_init ();
917
918 /* Init Data Flow analysis, to be used in interloop dep calculation. */
919 df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
920 df_rd_add_problem (df, 0);
921 df_ru_add_problem (df, 0);
922 df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
923 df_analyze (df);
924
925 if (dump_file)
926 df_dump (df, dump_file);
927
928   /* Allocate memory to hold the DDG array, one entry for each loop.
929 We use loop->num as index into this array. */
930 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
931
932 /* Build DDGs for all the relevant loops and hold them in G_ARR
933 indexed by the loop index. */
934 FOR_EACH_LOOP (li, loop, 0)
935 {
936 rtx head, tail;
937 rtx count_reg;
938
939 /* For debugging. */
940 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
941 {
942 if (dump_file)
943 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
944
945 break;
946 }
947
948 if (! loop_canon_p (loop))
949 continue;
950
951 if (! loop_single_full_bb_p (loop))
952 continue;
953
954 bb = loop->header;
955
956 get_ebb_head_tail (bb, bb, &head, &tail);
957 latch_edge = loop_latch_edge (loop);
958 gcc_assert (single_exit (loop));
959 if (single_exit (loop)->count)
960 trip_count = latch_edge->count / single_exit (loop)->count;
961
962       /* Perform SMS only on loops whose average count is above the threshold.  */
963
964 if ( latch_edge->count
965 && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
966 {
967 if (dump_file)
968 {
969 fprintf (dump_file, " %s %d (file, line)\n",
970 insn_file (tail), insn_line (tail));
971 fprintf (dump_file, "SMS single-bb-loop\n");
972 if (profile_info && flag_branch_probabilities)
973 {
974 fprintf (dump_file, "SMS loop-count ");
975 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
976 (HOST_WIDEST_INT) bb->count);
977 fprintf (dump_file, "\n");
978 fprintf (dump_file, "SMS trip-count ");
979 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
980 (HOST_WIDEST_INT) trip_count);
981 fprintf (dump_file, "\n");
982 fprintf (dump_file, "SMS profile-sum-max ");
983 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
984 (HOST_WIDEST_INT) profile_info->sum_max);
985 fprintf (dump_file, "\n");
986 }
987 }
988 continue;
989 }
990
991 /* Make sure this is a doloop. */
992 if ( !(count_reg = doloop_register_get (tail)))
993 continue;
994
995 /* Don't handle BBs with calls or barriers, or !single_set insns. */
996 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
997 if (CALL_P (insn)
998 || BARRIER_P (insn)
999 || (INSN_P (insn) && !JUMP_P (insn)
1000 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1001 break;
1002
1003 if (insn != NEXT_INSN (tail))
1004 {
1005 if (dump_file)
1006 {
1007 if (CALL_P (insn))
1008 fprintf (dump_file, "SMS loop-with-call\n");
1009 else if (BARRIER_P (insn))
1010 fprintf (dump_file, "SMS loop-with-barrier\n");
1011 else
1012 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1013 print_rtl_single (dump_file, insn);
1014 }
1015
1016 continue;
1017 }
1018
1019 if (! (g = create_ddg (bb, df, 0)))
1020 {
1021 if (dump_file)
1022 fprintf (dump_file, "SMS doloop\n");
1023 continue;
1024 }
1025
1026 g_arr[loop->num] = g;
1027 }
1028
1029 /* Release Data Flow analysis data structures. */
1030 df_finish (df);
1031 df = NULL;
1032
1033   /* We don't want to perform SMS on new loops created by versioning.  */
1034 FOR_EACH_LOOP (li, loop, 0)
1035 {
1036 rtx head, tail;
1037 rtx count_reg, count_init;
1038 int mii, rec_mii;
1039 unsigned stage_count = 0;
1040 HOST_WIDEST_INT loop_count = 0;
1041
1042 if (! (g = g_arr[loop->num]))
1043 continue;
1044
1045 if (dump_file)
1046 print_ddg (dump_file, g);
1047
1048 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1049
1050 latch_edge = loop_latch_edge (loop);
1051 gcc_assert (single_exit (loop));
1052 if (single_exit (loop)->count)
1053 trip_count = latch_edge->count / single_exit (loop)->count;
1054
1055 if (dump_file)
1056 {
1057 fprintf (dump_file, " %s %d (file, line)\n",
1058 insn_file (tail), insn_line (tail));
1059 fprintf (dump_file, "SMS single-bb-loop\n");
1060 if (profile_info && flag_branch_probabilities)
1061 {
1062 fprintf (dump_file, "SMS loop-count ");
1063 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1064 (HOST_WIDEST_INT) bb->count);
1065 fprintf (dump_file, "\n");
1066 fprintf (dump_file, "SMS profile-sum-max ");
1067 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1068 (HOST_WIDEST_INT) profile_info->sum_max);
1069 fprintf (dump_file, "\n");
1070 }
1071 fprintf (dump_file, "SMS doloop\n");
1072 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1073 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1074 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1075 }
1076
1077
1078       /* If the loop has a doloop register, it gets special
1079 handling. */
1080 count_init = NULL_RTX;
1081 if ((count_reg = doloop_register_get (tail)))
1082 {
1083 basic_block pre_header;
1084
1085 pre_header = loop_preheader_edge (loop)->src;
1086 count_init = const_iteration_count (count_reg, pre_header,
1087 &loop_count);
1088 }
1089 gcc_assert (count_reg);
1090
1091 if (dump_file && count_init)
1092 {
1093 fprintf (dump_file, "SMS const-doloop ");
1094 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1095 loop_count);
1096 fprintf (dump_file, "\n");
1097 }
1098
1099 node_order = XNEWVEC (int, g->num_nodes);
1100
1101 mii = 1; /* Need to pass some estimate of mii. */
1102 rec_mii = sms_order_nodes (g, mii, node_order);
1103 mii = MAX (res_MII (g), rec_mii);
1104 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1105
1106 if (dump_file)
1107 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1108 rec_mii, mii, maxii);
1109
1110 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1111 ASAP. */
1112 set_node_sched_params (g);
1113
1114 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1115
1116 if (ps)
1117 stage_count = PS_STAGE_COUNT (ps);
1118
1119 /* Stage count of 1 means that there is no interleaving between
1120          iterations; let the normal scheduling passes do the job.  */
1121 if (stage_count < 1
1122 || (count_init && (loop_count <= stage_count))
1123 || (flag_branch_probabilities && (trip_count <= stage_count)))
1124 {
1125 if (dump_file)
1126 {
1127 fprintf (dump_file, "SMS failed... \n");
1128 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1129 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1130 fprintf (dump_file, ", trip-count=");
1131 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1132 fprintf (dump_file, ")\n");
1133 }
1134 continue;
1135 }
1136 else
1137 {
1138 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1139 int new_cycles;
1140 struct undo_replace_buff_elem *reg_move_replaces;
1141
1142 if (dump_file)
1143 {
1144 fprintf (dump_file,
1145 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1146 stage_count);
1147 print_partial_schedule (ps, dump_file);
1148 fprintf (dump_file,
1149 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1150 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1151 }
1152
1153 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1154 the closing_branch was scheduled and should appear in the last (ii-1)
1155 row. Otherwise, we are free to schedule the branch, and we let nodes
1156 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1157 row; this should reduce stage_count to minimum. */
1158 normalize_sched_times (ps);
1159 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1160 set_columns_for_ps (ps);
1161
1162 /* Generate the kernel just to be able to measure its cycles. */
1163 permute_partial_schedule (ps, g->closing_branch->first_note);
1164 reg_move_replaces = generate_reg_moves (ps);
1165
1166           /* Get the number of cycles the new kernel expects to execute in.  */
1167 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1168
1169 /* Get back to the original loop so we can do loop versioning. */
1170 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1171 if (reg_move_replaces)
1172 undo_generate_reg_moves (ps, reg_move_replaces);
1173
1174 if ( new_cycles >= orig_cycles)
1175 {
1176 /* SMS is not profitable so undo the permutation and reg move generation
1177 and return the kernel to its original state. */
1178 if (dump_file)
1179 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1180
1181 }
1182 else
1183 {
1184 canon_loop (loop);
1185
1186               /* In case the BCT count is not known, do loop-versioning.  */
1187 if (count_reg && ! count_init)
1188 {
1189 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1190 GEN_INT(stage_count));
1191 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1192 * REG_BR_PROB_BASE) / 100;
1193
1194 loop_version (loop, comp_rtx, &condition_bb,
1195 prob, prob, REG_BR_PROB_BASE - prob,
1196 true);
1197 }
1198
1199 /* Set new iteration count of loop kernel. */
1200 if (count_reg && count_init)
1201 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1202 - stage_count + 1);
1203
1204 /* Now apply the scheduled kernel to the RTL of the loop. */
1205 permute_partial_schedule (ps, g->closing_branch->first_note);
1206
1207 /* Mark this loop as software pipelined so the later
1208                  scheduling passes don't touch it.  */
1209 if (! flag_resched_modulo_sched)
1210 g->bb->flags |= BB_DISABLE_SCHEDULE;
1211 /* The life-info is not valid any more. */
1212 g->bb->flags |= BB_DIRTY;
1213
1214 reg_move_replaces = generate_reg_moves (ps);
1215 if (dump_file)
1216 print_node_sched_params (dump_file, g->num_nodes);
1217 /* Generate prolog and epilog. */
1218 if (count_reg && !count_init)
1219 generate_prolog_epilog (ps, loop, count_reg);
1220 else
1221 generate_prolog_epilog (ps, loop, NULL_RTX);
1222 }
1223 free_undo_replace_buff (reg_move_replaces);
1224 }
1225
1226 free_partial_schedule (ps);
1227 free (node_sched_params);
1228 free (node_order);
1229 free_ddg (g);
1230 }
1231
1232 free (g_arr);
1233
1234 /* Release scheduler data, needed until now because of DFA. */
1235 sched_finish ();
1236 loop_optimizer_finalize ();
1237 }
1238
1239 /* The SMS scheduling algorithm itself
1240 -----------------------------------
1241 Input: 'O' an ordered list of insns of a loop.
1242 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1243
1244 'Q' is the empty Set
1245 'PS' is the partial schedule; it holds the currently scheduled nodes with
1246 their cycle/slot.
1247 'PSP' previously scheduled predecessors.
1248 'PSS' previously scheduled successors.
1249 't(u)' the cycle where u is scheduled.
1250 'l(u)' is the latency of u.
1251 'd(v,u)' is the dependence distance from v to u.
1252 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1253 the node ordering phase.
1254 'check_hardware_resources_conflicts(u, PS, c)'
1255 run a trace around cycle/slot through DFA model
1256 to check resource conflicts involving instruction u
1257 at cycle c given the partial schedule PS.
1258 'add_to_partial_schedule_at_time(u, PS, c)'
1259 Add the node/instruction u to the partial schedule
1260 PS at time c.
1261 'calculate_register_pressure(PS)'
1262 Given a schedule of instructions, calculate the register
1263 pressure it implies. One implementation could be the
1264 maximum number of overlapping live ranges.
1265    'maxRP' The maximum allowed register pressure; it is usually derived
1266            from the number of registers available in the hardware.
1267
1268 1. II = MII.
1269 2. PS = empty list
1270 3. for each node u in O in pre-computed order
1271 4. if (PSP(u) != Q && PSS(u) == Q) then
1272    5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1273 6. start = Early_start; end = Early_start + II - 1; step = 1
1274 11. else if (PSP(u) == Q && PSS(u) != Q) then
1275    12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1276 13. start = Late_start; end = Late_start - II + 1; step = -1
1277 14. else if (PSP(u) != Q && PSS(u) != Q) then
1278    15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1279    16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1280 17. start = Early_start;
1281 18. end = min(Early_start + II - 1 , Late_start);
1282 19. step = 1
1283 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1284 21. start = ASAP(u); end = start + II - 1; step = 1
1285 22. endif
1286
1287 23. success = false
1288 24. for (c = start ; c != end ; c += step)
1289 25. if check_hardware_resources_conflicts(u, PS, c) then
1290 26. add_to_partial_schedule_at_time(u, PS, c)
1291 27. success = true
1292 28. break
1293 29. endif
1294 30. endfor
1295 31. if (success == false) then
1296 32. II = II + 1
1297 33. if (II > maxII) then
1298 34. finish - failed to schedule
1299 35. endif
1300 36. goto 2.
1301 37. endif
1302 38. endfor
1303 39. if (calculate_register_pressure(PS) > maxRP) then
1304 40. goto 32.
1305 41. endif
1306 42. compute epilogue & prologue
1307 43. finish - succeeded to schedule
1308 */
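/* A small worked instance of step 5 above: for v in PSP(u) with
   t(v) = 5, l(v) = 2 and d(v,u) = 1 under II = 4, v contributes
   t(v) + l(v) - d(v,u)*II = 5 + 2 - 4 = 3 to Early_start(u), so u
   cannot be scheduled before cycle 3.  */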
1309
1310 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1311 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1312 set to 0 to save compile time. */
1313 #define DFA_HISTORY SMS_DFA_HISTORY
1314
1315 /* Given the partial schedule PS, this function calculates and returns the
1316 cycles in which we can schedule the node with the given index I.
1317 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1318 noticed that there are several cases in which we fail to SMS the loop
1319 because the sched window of a node is empty due to tight data-deps. In
1320 such cases we want to unschedule some of the predecessors/successors
1321    until we get a non-empty scheduling window.  It returns -1 if the
1322 scheduling window is empty and zero otherwise. */
1323
1324 static int
1325 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1326 sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1327 {
1328 int start, step, end;
1329 ddg_edge_ptr e;
1330 int u = nodes_order [i];
1331 ddg_node_ptr u_node = &ps->g->nodes[u];
1332 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1333 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1334 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1335 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1336 int psp_not_empty;
1337 int pss_not_empty;
1338
1339 /* 1. compute sched window for u (start, end, step). */
1340 sbitmap_zero (psp);
1341 sbitmap_zero (pss);
1342 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1343 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1344
1345 if (psp_not_empty && !pss_not_empty)
1346 {
1347 int early_start = INT_MIN;
1348
1349 end = INT_MAX;
1350 for (e = u_node->in; e != 0; e = e->next_in)
1351 {
1352 ddg_node_ptr v_node = e->src;
1353 if (TEST_BIT (sched_nodes, v_node->cuid))
1354 {
1355 int node_st = SCHED_TIME (v_node)
1356 + e->latency - (e->distance * ii);
1357
1358 early_start = MAX (early_start, node_st);
1359
1360 if (e->data_type == MEM_DEP)
1361 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1362 }
1363 }
1364 start = early_start;
1365 end = MIN (end, early_start + ii);
1366 step = 1;
1367 }
1368
1369 else if (!psp_not_empty && pss_not_empty)
1370 {
1371 int late_start = INT_MAX;
1372
1373 end = INT_MIN;
1374 for (e = u_node->out; e != 0; e = e->next_out)
1375 {
1376 ddg_node_ptr v_node = e->dest;
1377 if (TEST_BIT (sched_nodes, v_node->cuid))
1378 {
1379 late_start = MIN (late_start,
1380 SCHED_TIME (v_node) - e->latency
1381 + (e->distance * ii));
1382 if (e->data_type == MEM_DEP)
1383 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1384 }
1385 }
1386 start = late_start;
1387 end = MAX (end, late_start - ii);
1388 step = -1;
1389 }
1390
1391 else if (psp_not_empty && pss_not_empty)
1392 {
1393 int early_start = INT_MIN;
1394 int late_start = INT_MAX;
1395
1396 start = INT_MIN;
1397 end = INT_MAX;
1398 for (e = u_node->in; e != 0; e = e->next_in)
1399 {
1400 ddg_node_ptr v_node = e->src;
1401
1402 if (TEST_BIT (sched_nodes, v_node->cuid))
1403 {
1404 early_start = MAX (early_start,
1405 SCHED_TIME (v_node) + e->latency
1406 - (e->distance * ii));
1407 if (e->data_type == MEM_DEP)
1408 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1409 }
1410 }
1411 for (e = u_node->out; e != 0; e = e->next_out)
1412 {
1413 ddg_node_ptr v_node = e->dest;
1414
1415 if (TEST_BIT (sched_nodes, v_node->cuid))
1416 {
1417 late_start = MIN (late_start,
1418 SCHED_TIME (v_node) - e->latency
1419 + (e->distance * ii));
1420 if (e->data_type == MEM_DEP)
1421 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1422 }
1423 }
1424 start = MAX (start, early_start);
1425 end = MIN (end, MIN (early_start + ii, late_start + 1));
1426 step = 1;
1427 }
1428 else /* psp is empty && pss is empty. */
1429 {
1430 start = SCHED_ASAP (u_node);
1431 end = start + ii;
1432 step = 1;
1433 }
1434
1435 *start_p = start;
1436 *step_p = step;
1437 *end_p = end;
1438 sbitmap_free (psp);
1439 sbitmap_free (pss);
1440
1441 if ((start >= end && step == 1) || (start <= end && step == -1))
1442 return -1;
1443 else
1444 return 0;
1445 }
1446
1447 /* This function implements the scheduling algorithm for SMS according to the
1448 above algorithm. */
1449 static partial_schedule_ptr
1450 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
1451 {
1452 int ii = mii;
1453 int i, c, success;
1454 int try_again_with_larger_ii = true;
1455 int num_nodes = g->num_nodes;
1456 ddg_edge_ptr e;
1457 int start, end, step; /* Place together into one struct? */
1458 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1459 sbitmap must_precede = sbitmap_alloc (num_nodes);
1460 sbitmap must_follow = sbitmap_alloc (num_nodes);
1461 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1462
1463 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1464
1465 sbitmap_ones (tobe_scheduled);
1466 sbitmap_zero (sched_nodes);
1467
1468 while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1469 || try_again_with_larger_ii ) && ii < maxii)
1470 {
1471 int j;
1472 bool unscheduled_nodes = false;
1473
1474 if (dump_file)
1475 fprintf (dump_file, "Starting with ii=%d\n", ii);
1476 if (try_again_with_larger_ii)
1477 {
1478 try_again_with_larger_ii = false;
1479 sbitmap_zero (sched_nodes);
1480 }
1481
1482 for (i = 0; i < num_nodes; i++)
1483 {
1484 int u = nodes_order[i];
1485 ddg_node_ptr u_node = &ps->g->nodes[u];
1486 rtx insn = u_node->insn;
1487
1488 if (!INSN_P (insn))
1489 {
1490 RESET_BIT (tobe_scheduled, u);
1491 continue;
1492 }
1493
1494 if (JUMP_P (insn)) /* Closing branch handled later. */
1495 {
1496 RESET_BIT (tobe_scheduled, u);
1497 continue;
1498 }
1499
1500 if (TEST_BIT (sched_nodes, u))
1501 continue;
1502
1503 /* Try to get non-empty scheduling window. */
1504 j = i;
1505 while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1506 && j > 0)
1507 {
1508 unscheduled_nodes = true;
1509 if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1510 || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1511 {
1512 ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1513 RESET_BIT (sched_nodes, nodes_order [j - 1]);
1514 }
1515 j--;
1516 }
1517 if (j < 0)
1518 {
1519 /* ??? Try backtracking instead of immediately ii++? */
1520 ii++;
1521 try_again_with_larger_ii = true;
1522 reset_partial_schedule (ps, ii);
1523 break;
1524 }
1525 /* 2. Try scheduling u in window. */
1526 if (dump_file)
1527 fprintf (dump_file,
1528 "Trying to schedule node %d in (%d .. %d) step %d\n",
1529 u, start, end, step);
1530
1531          /* Use the must_follow and must_precede bitmaps to determine the order
1532 of nodes within the cycle. */
1533 sbitmap_zero (must_precede);
1534 sbitmap_zero (must_follow);
1535 for (e = u_node->in; e != 0; e = e->next_in)
1536 if (TEST_BIT (sched_nodes, e->src->cuid)
1537 && e->latency == (ii * e->distance)
1538 && start == SCHED_TIME (e->src))
1539 SET_BIT (must_precede, e->src->cuid);
1540
1541 for (e = u_node->out; e != 0; e = e->next_out)
1542 if (TEST_BIT (sched_nodes, e->dest->cuid)
1543 && e->latency == (ii * e->distance)
1544 && end == SCHED_TIME (e->dest))
1545 SET_BIT (must_follow, e->dest->cuid);
1546
1547 success = 0;
1548 if ((step > 0 && start < end) || (step < 0 && start > end))
1549 for (c = start; c != end; c += step)
1550 {
1551 ps_insn_ptr psi;
1552
1553 psi = ps_add_node_check_conflicts (ps, u_node, c,
1554 must_precede,
1555 must_follow);
1556
1557 if (psi)
1558 {
1559 SCHED_TIME (u_node) = c;
1560 SET_BIT (sched_nodes, u);
1561 success = 1;
1562 if (dump_file)
1563 fprintf (dump_file, "Schedule in %d\n", c);
1564 break;
1565 }
1566 }
1567 if (!success)
1568 {
1569 /* ??? Try backtracking instead of immediately ii++? */
1570 ii++;
1571 try_again_with_larger_ii = true;
1572 reset_partial_schedule (ps, ii);
1573 break;
1574 }
1575 if (unscheduled_nodes)
1576 break;
1577
1578 /* ??? If (success), check register pressure estimates. */
1579 } /* Continue with next node. */
1580 } /* While try_again_with_larger_ii. */
1581
1582 sbitmap_free (sched_nodes);
1583 sbitmap_free (must_precede);
1584 sbitmap_free (must_follow);
1585 sbitmap_free (tobe_scheduled);
1586
1587 if (ii >= maxii)
1588 {
1589 free_partial_schedule (ps);
1590 ps = NULL;
1591 }
1592 return ps;
1593 }
1594
1595 \f
1596 /* This page implements the algorithm for ordering the nodes of a DDG
1597 for modulo scheduling, activated through the
1598 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1599
1600 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1601 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1602 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1603 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1604 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1605 #define DEPTH(x) (ASAP ((x)))
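/* For illustration: a node with ASAP = 2 and ALAP = 5 has MOB = 3
   cycles of slack, so it can be placed in any of 4 cycles without
   stretching the schedule; zero-mobility nodes lie on a critical
   path.  */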
1606
1607 typedef struct node_order_params * nopa;
1608
1609 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1610 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1611 static nopa calculate_order_params (ddg_ptr, int mii);
1612 static int find_max_asap (ddg_ptr, sbitmap);
1613 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1614 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1615
1616 enum sms_direction {BOTTOMUP, TOPDOWN};
1617
1618 struct node_order_params
1619 {
1620 int asap;
1621 int alap;
1622 int height;
1623 };
1624
1625 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1626 static void
1627 check_nodes_order (int *node_order, int num_nodes)
1628 {
1629 int i;
1630 sbitmap tmp = sbitmap_alloc (num_nodes);
1631
1632 sbitmap_zero (tmp);
1633
1634 for (i = 0; i < num_nodes; i++)
1635 {
1636 int u = node_order[i];
1637
1638 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1639
1640 SET_BIT (tmp, u);
1641 }
1642
1643 sbitmap_free (tmp);
1644 }
1645
1646 /* Order the nodes of G for scheduling and pass the result in
1647 NODE_ORDER. Also set aux.count of each node to ASAP.
1648 Return the recMII for the given DDG. */
1649 static int
1650 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1651 {
1652 int i;
1653 int rec_mii = 0;
1654 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1655
1656 nopa nops = calculate_order_params (g, mii);
1657
1658 order_nodes_of_sccs (sccs, node_order);
1659
1660 if (sccs->num_sccs > 0)
1661 /* First SCC has the largest recurrence_length. */
1662 rec_mii = sccs->sccs[0]->recurrence_length;
1663
1664 /* Save ASAP before destroying node_order_params. */
1665 for (i = 0; i < g->num_nodes; i++)
1666 {
1667 ddg_node_ptr v = &g->nodes[i];
1668 v->aux.count = ASAP (v);
1669 }
1670
1671 free (nops);
1672 free_ddg_all_sccs (sccs);
1673 check_nodes_order (node_order, g->num_nodes);
1674
1675 return rec_mii;
1676 }
1677
1678 static void
1679 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1680 {
1681 int i, pos = 0;
1682 ddg_ptr g = all_sccs->ddg;
1683 int num_nodes = g->num_nodes;
1684 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1685 sbitmap on_path = sbitmap_alloc (num_nodes);
1686 sbitmap tmp = sbitmap_alloc (num_nodes);
1687 sbitmap ones = sbitmap_alloc (num_nodes);
1688
1689 sbitmap_zero (prev_sccs);
1690 sbitmap_ones (ones);
1691
1692   /* Perform the node ordering starting from the SCC with the highest recMII.
1693 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1694 for (i = 0; i < all_sccs->num_sccs; i++)
1695 {
1696 ddg_scc_ptr scc = all_sccs->sccs[i];
1697
1698 /* Add nodes on paths from previous SCCs to the current SCC. */
1699 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1700 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1701
1702 /* Add nodes on paths from the current SCC to previous SCCs. */
1703 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1704 sbitmap_a_or_b (tmp, tmp, on_path);
1705
1706 /* Remove nodes of previous SCCs from current extended SCC. */
1707 sbitmap_difference (tmp, tmp, prev_sccs);
1708
1709 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1710 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1711 }
1712
1713 /* Handle the remaining nodes that do not belong to any scc. Each call
1714 to order_nodes_in_scc handles a single connected component. */
1715 while (pos < g->num_nodes)
1716 {
1717 sbitmap_difference (tmp, ones, prev_sccs);
1718 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1719 }
1720 sbitmap_free (prev_sccs);
1721 sbitmap_free (on_path);
1722 sbitmap_free (tmp);
1723 sbitmap_free (ones);
1724 }
1725
1726 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1727 static struct node_order_params *
1728 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1729 {
1730 int u;
1731 int max_asap;
1732 int num_nodes = g->num_nodes;
1733 ddg_edge_ptr e;
1734 /* Allocate a place to hold ordering params for each node in the DDG. */
1735 nopa node_order_params_arr;
1736
1737 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1738 node_order_params_arr = (nopa) xcalloc (num_nodes,
1739 sizeof (struct node_order_params));
1740
1741 /* Set the aux pointer of each node to point to its order_params structure. */
1742 for (u = 0; u < num_nodes; u++)
1743 g->nodes[u].aux.info = &node_order_params_arr[u];
1744
1745 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1746 calculate ASAP, ALAP, mobility, distance, and height for each node
1747      in the dependence (directed acyclic) graph.  */
1748
1749 /* We assume that the nodes in the array are in topological order. */
1750
1751 max_asap = 0;
1752 for (u = 0; u < num_nodes; u++)
1753 {
1754 ddg_node_ptr u_node = &g->nodes[u];
1755
1756 ASAP (u_node) = 0;
1757 for (e = u_node->in; e; e = e->next_in)
1758 if (e->distance == 0)
1759 ASAP (u_node) = MAX (ASAP (u_node),
1760 ASAP (e->src) + e->latency);
1761 max_asap = MAX (max_asap, ASAP (u_node));
1762 }
1763
1764 for (u = num_nodes - 1; u > -1; u--)
1765 {
1766 ddg_node_ptr u_node = &g->nodes[u];
1767
1768 ALAP (u_node) = max_asap;
1769 HEIGHT (u_node) = 0;
1770 for (e = u_node->out; e; e = e->next_out)
1771 if (e->distance == 0)
1772 {
1773 ALAP (u_node) = MIN (ALAP (u_node),
1774 ALAP (e->dest) - e->latency);
1775 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1776 HEIGHT (e->dest) + e->latency);
1777 }
1778 }
1779
1780 return node_order_params_arr;
1781 }
1782
1783 static int
1784 find_max_asap (ddg_ptr g, sbitmap nodes)
1785 {
1786 unsigned int u = 0;
1787 int max_asap = -1;
1788 int result = -1;
1789 sbitmap_iterator sbi;
1790
1791 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1792 {
1793 ddg_node_ptr u_node = &g->nodes[u];
1794
1795 if (max_asap < ASAP (u_node))
1796 {
1797 max_asap = ASAP (u_node);
1798 result = u;
1799 }
1800 }
1801 return result;
1802 }
1803
1804 static int
1805 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1806 {
1807 unsigned int u = 0;
1808 int max_hv = -1;
1809 int min_mob = INT_MAX;
1810 int result = -1;
1811 sbitmap_iterator sbi;
1812
1813 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1814 {
1815 ddg_node_ptr u_node = &g->nodes[u];
1816
1817 if (max_hv < HEIGHT (u_node))
1818 {
1819 max_hv = HEIGHT (u_node);
1820 min_mob = MOB (u_node);
1821 result = u;
1822 }
1823 else if ((max_hv == HEIGHT (u_node))
1824 && (min_mob > MOB (u_node)))
1825 {
1826 min_mob = MOB (u_node);
1827 result = u;
1828 }
1829 }
1830 return result;
1831 }
1832
1833 static int
1834 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1835 {
1836 unsigned int u = 0;
1837 int max_dv = -1;
1838 int min_mob = INT_MAX;
1839 int result = -1;
1840 sbitmap_iterator sbi;
1841
1842 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1843 {
1844 ddg_node_ptr u_node = &g->nodes[u];
1845
1846 if (max_dv < DEPTH (u_node))
1847 {
1848 max_dv = DEPTH (u_node);
1849 min_mob = MOB (u_node);
1850 result = u;
1851 }
1852 else if ((max_dv == DEPTH (u_node))
1853 && (min_mob > MOB (u_node)))
1854 {
1855 min_mob = MOB (u_node);
1856 result = u;
1857 }
1858 }
1859 return result;
1860 }
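
/* Note on the selectors above: find_max_hv_min_mob and find_max_dv_min_mob
   break ties between nodes of equal height/depth by preferring the one
   with smaller mobility (ALAP - ASAP), i.e. the node with less scheduling
   freedom, which is the more urgent one to place.  */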
1861
1862 /* Places the nodes of SCC into the NODE_ORDER array starting
1863 at position POS, according to the SMS ordering algorithm.
1864 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1865 the NODE_ORDER array, starting from position zero. */
1866 static int
1867 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1868 int * node_order, int pos)
1869 {
1870 enum sms_direction dir;
1871 int num_nodes = g->num_nodes;
1872 sbitmap workset = sbitmap_alloc (num_nodes);
1873 sbitmap tmp = sbitmap_alloc (num_nodes);
1874 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1875 sbitmap predecessors = sbitmap_alloc (num_nodes);
1876 sbitmap successors = sbitmap_alloc (num_nodes);
1877
1878 sbitmap_zero (predecessors);
1879 find_predecessors (predecessors, g, nodes_ordered);
1880
1881 sbitmap_zero (successors);
1882 find_successors (successors, g, nodes_ordered);
1883
1884 sbitmap_zero (tmp);
1885 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1886 {
1887 sbitmap_copy (workset, tmp);
1888 dir = BOTTOMUP;
1889 }
1890 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1891 {
1892 sbitmap_copy (workset, tmp);
1893 dir = TOPDOWN;
1894 }
1895 else
1896 {
1897 int u;
1898
1899 sbitmap_zero (workset);
1900 if ((u = find_max_asap (g, scc)) >= 0)
1901 SET_BIT (workset, u);
1902 dir = BOTTOMUP;
1903 }
1904
1905 sbitmap_zero (zero_bitmap);
1906 while (!sbitmap_equal (workset, zero_bitmap))
1907 {
1908 int v;
1909 ddg_node_ptr v_node;
1910 sbitmap v_node_preds;
1911 sbitmap v_node_succs;
1912
1913 if (dir == TOPDOWN)
1914 {
1915 while (!sbitmap_equal (workset, zero_bitmap))
1916 {
1917 v = find_max_hv_min_mob (g, workset);
1918 v_node = &g->nodes[v];
1919 node_order[pos++] = v;
1920 v_node_succs = NODE_SUCCESSORS (v_node);
1921 sbitmap_a_and_b (tmp, v_node_succs, scc);
1922
1923 /* Don't consider the already ordered successors again. */
1924 sbitmap_difference (tmp, tmp, nodes_ordered);
1925 sbitmap_a_or_b (workset, workset, tmp);
1926 RESET_BIT (workset, v);
1927 SET_BIT (nodes_ordered, v);
1928 }
1929 dir = BOTTOMUP;
1930 sbitmap_zero (predecessors);
1931 find_predecessors (predecessors, g, nodes_ordered);
1932 sbitmap_a_and_b (workset, predecessors, scc);
1933 }
1934 else
1935 {
1936 while (!sbitmap_equal (workset, zero_bitmap))
1937 {
1938 v = find_max_dv_min_mob (g, workset);
1939 v_node = &g->nodes[v];
1940 node_order[pos++] = v;
1941 v_node_preds = NODE_PREDECESSORS (v_node);
1942 sbitmap_a_and_b (tmp, v_node_preds, scc);
1943
1944 /* Don't consider the already ordered predecessors again. */
1945 sbitmap_difference (tmp, tmp, nodes_ordered);
1946 sbitmap_a_or_b (workset, workset, tmp);
1947 RESET_BIT (workset, v);
1948 SET_BIT (nodes_ordered, v);
1949 }
1950 dir = TOPDOWN;
1951 sbitmap_zero (successors);
1952 find_successors (successors, g, nodes_ordered);
1953 sbitmap_a_and_b (workset, successors, scc);
1954 }
1955 }
1956 sbitmap_free (tmp);
1957 sbitmap_free (workset);
1958 sbitmap_free (zero_bitmap);
1959 sbitmap_free (predecessors);
1960 sbitmap_free (successors);
1961 return pos;
1962 }
1963
1964 \f
1965 /* This page contains functions for manipulating partial-schedules during
1966 modulo scheduling. */
1967
1968 /* Create a partial schedule and allocate memory to hold II rows.  */
1969
1970 static partial_schedule_ptr
1971 create_partial_schedule (int ii, ddg_ptr g, int history)
1972 {
1973 partial_schedule_ptr ps = XNEW (struct partial_schedule);
1974 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1975 ps->ii = ii;
1976 ps->history = history;
1977 ps->min_cycle = INT_MAX;
1978 ps->max_cycle = INT_MIN;
1979 ps->g = g;
1980
1981 return ps;
1982 }
1983
1984 /* Free the PS_INSNs in the rows array of the given partial schedule.
1985 ??? Consider caching the PS_INSNs.  */
1986 static void
1987 free_ps_insns (partial_schedule_ptr ps)
1988 {
1989 int i;
1990
1991 for (i = 0; i < ps->ii; i++)
1992 {
1993 while (ps->rows[i])
1994 {
1995 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1996
1997 free (ps->rows[i]);
1998 ps->rows[i] = ps_insn;
1999 }
2000 ps->rows[i] = NULL;
2001 }
2002 }
2003
2004 /* Free all the memory allocated to the partial schedule. */
2005
2006 static void
2007 free_partial_schedule (partial_schedule_ptr ps)
2008 {
2009 if (!ps)
2010 return;
2011 free_ps_insns (ps);
2012 free (ps->rows);
2013 free (ps);
2014 }
2015
2016 /* Clear the rows array with its PS_INSNs, and create a new one with
2017 NEW_II rows. */
2018
2019 static void
2020 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2021 {
2022 if (!ps)
2023 return;
2024 free_ps_insns (ps);
2025 if (new_ii == ps->ii)
2026 return;
2027 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2028 * sizeof (ps_insn_ptr));
2029 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2030 ps->ii = new_ii;
2031 ps->min_cycle = INT_MAX;
2032 ps->max_cycle = INT_MIN;
2033 }
2034
2035 /* Prints the partial schedule as an ii-row array; for each row,
2036 print the ids of the insns in it.  */
2037 void
2038 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2039 {
2040 int i;
2041
2042 for (i = 0; i < ps->ii; i++)
2043 {
2044 ps_insn_ptr ps_i = ps->rows[i];
2045
2046 fprintf (dump, "\n[CYCLE %d ]: ", i);
2047 while (ps_i)
2048 {
2049 fprintf (dump, "%d, ",
2050 INSN_UID (ps_i->node->insn));
2051 ps_i = ps_i->next_in_row;
2052 }
2053 }
2054 }
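
/* For illustration, with II = 2 the dump produced above might look like
   (insn UIDs hypothetical):

     [CYCLE 0 ]: 12, 17,
     [CYCLE 1 ]: 14,

   meaning insns 12 and 17 issue in the even cycles of the kernel and
   insn 14 in the odd ones.  */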
2055
2056 /* Creates a PS_INSN object and initializes it with the given parameters.  */
2057 static ps_insn_ptr
2058 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2059 {
2060 ps_insn_ptr ps_i = XNEW (struct ps_insn);
2061
2062 ps_i->node = node;
2063 ps_i->next_in_row = NULL;
2064 ps_i->prev_in_row = NULL;
2065 ps_i->row_rest_count = rest_count;
2066 ps_i->cycle = cycle;
2067
2068 return ps_i;
2069 }
2070
2071
2072 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2073 node is not found in the partial schedule, else returns true. */
2074 static bool
2075 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2076 {
2077 int row;
2078
2079 if (!ps || !ps_i)
2080 return false;
2081
2082 row = SMODULO (ps_i->cycle, ps->ii);
2083 if (! ps_i->prev_in_row)
2084 {
2085 if (ps_i != ps->rows[row])
2086 return false;
2087
2088 ps->rows[row] = ps_i->next_in_row;
2089 if (ps->rows[row])
2090 ps->rows[row]->prev_in_row = NULL;
2091 }
2092 else
2093 {
2094 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2095 if (ps_i->next_in_row)
2096 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2097 }
2098 free (ps_i);
2099 return true;
2100 }
2101
2102 /* Unlike what the literature describes for modulo scheduling (which
2103 focuses on VLIW machines), the order of the instructions inside a
2104 cycle is important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2105 where the current instruction should go relative to the already
2106 scheduled instructions in the given cycle. Go over these
2107 instructions and find the first possible column to put it in. */
2108 static bool
2109 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2110 sbitmap must_precede, sbitmap must_follow)
2111 {
2112 ps_insn_ptr next_ps_i;
2113 ps_insn_ptr first_must_follow = NULL;
2114 ps_insn_ptr last_must_precede = NULL;
2115 int row;
2116
2117 if (! ps_i)
2118 return false;
2119
2120 row = SMODULO (ps_i->cycle, ps->ii);
2121
2122 /* Find the first must-follow and the last must-precede insns, and
2123 insert the node immediately after the last must-precede, making
2124 sure that no must-follow insn comes before that position.  */
2125 for (next_ps_i = ps->rows[row];
2126 next_ps_i;
2127 next_ps_i = next_ps_i->next_in_row)
2128 {
2129 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2130 && ! first_must_follow)
2131 first_must_follow = next_ps_i;
2132 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2133 {
2134 /* If we have already met a node that must follow, then
2135 there is no possible column. */
2136 if (first_must_follow)
2137 return false;
2138 else
2139 last_must_precede = next_ps_i;
2140 }
2141 }
2142
2143 /* Now insert the node after LAST_MUST_PRECEDE, or at the head of the row if there is no must-precede insn.  */
2144
2145 if (! last_must_precede)
2146 {
2147 ps_i->next_in_row = ps->rows[row];
2148 ps_i->prev_in_row = NULL;
2149 if (ps_i->next_in_row)
2150 ps_i->next_in_row->prev_in_row = ps_i;
2151 ps->rows[row] = ps_i;
2152 }
2153 else
2154 {
2155 ps_i->next_in_row = last_must_precede->next_in_row;
2156 last_must_precede->next_in_row = ps_i;
2157 ps_i->prev_in_row = last_must_precede;
2158 if (ps_i->next_in_row)
2159 ps_i->next_in_row->prev_in_row = ps_i;
2160 }
2161
2162 return true;
2163 }
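
/* A worked example (row contents hypothetical): if the row already holds
   A -> B -> C -> D, with B's bit set in MUST_PRECEDE and D's bit set in
   MUST_FOLLOW, then last_must_precede = B, first_must_follow = D, and the
   node is linked in right after B: A -> B -> new -> C -> D. Had a
   must-follow insn occurred before B, there would be no legal column and
   the function would return false.  */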
2164
2165 /* Advances the PS_INSN one column in its current row; returns false
2166 on failure and true on success. Bit N is set in MUST_FOLLOW if
2167 the node with cuid N must come after the node pointed to by
2168 PS_I when scheduled in the same cycle.  */
2169 static int
2170 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2171 sbitmap must_follow)
2172 {
2173 ps_insn_ptr prev, next;
2174 int row;
2175 ddg_node_ptr next_node;
2176
2177 if (!ps || !ps_i)
2178 return false;
2179
2180 row = SMODULO (ps_i->cycle, ps->ii);
2181
2182 if (! ps_i->next_in_row)
2183 return false;
2184
2185 next_node = ps_i->next_in_row->node;
2186
2187 /* Check if next_in_row is dependent on ps_i, both having the same
2188 sched times (typically ANTI_DEP). If so, ps_i cannot skip over it.  */
2189 if (TEST_BIT (must_follow, next_node->cuid))
2190 return false;
2191
2192 /* Advance PS_I over its next_in_row in the doubly linked list. */
2193 prev = ps_i->prev_in_row;
2194 next = ps_i->next_in_row;
2195
2196 if (ps_i == ps->rows[row])
2197 ps->rows[row] = next;
2198
2199 ps_i->next_in_row = next->next_in_row;
2200
2201 if (next->next_in_row)
2202 next->next_in_row->prev_in_row = ps_i;
2203
2204 next->next_in_row = ps_i;
2205 ps_i->prev_in_row = next;
2206
2207 next->prev_in_row = prev;
2208 if (prev)
2209 prev->next_in_row = next;
2210
2211 return true;
2212 }
2213
2214 /* Inserts a DDG_NODE into the given partial schedule at the given cycle.
2215 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2216 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2217 before/after (respectively) the node pointed to by PS_I when scheduled
2218 in the same cycle.  */
2219 static ps_insn_ptr
2220 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2221 sbitmap must_precede, sbitmap must_follow)
2222 {
2223 ps_insn_ptr ps_i;
2224 int rest_count = 1;
2225 int row = SMODULO (cycle, ps->ii);
2226
2227 if (ps->rows[row]
2228 && ps->rows[row]->row_rest_count >= issue_rate)
2229 return NULL;
2230
2231 if (ps->rows[row])
2232 rest_count += ps->rows[row]->row_rest_count;
2233
2234 ps_i = create_ps_insn (node, rest_count, cycle);
2235
2236 /* Finds and inserts PS_I according to MUST_FOLLOW and
2237 MUST_PRECEDE. */
2238 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2239 {
2240 free (ps_i);
2241 return NULL;
2242 }
2243
2244 return ps_i;
2245 }
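
/* Rough illustration of the issue-slot bookkeeping above: row_rest_count
   of a row's head insn tracks how many insns the row holds, so with a
   hypothetical issue_rate of 4 a row whose head carries row_rest_count == 4
   is considered full and a fifth insn is rejected for that row.  */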
2246
2247 /* Advance time one cycle. Assumes DFA is being used. */
2248 static void
2249 advance_one_cycle (void)
2250 {
2251 if (targetm.sched.dfa_pre_cycle_insn)
2252 state_transition (curr_state,
2253 targetm.sched.dfa_pre_cycle_insn ());
2254
2255 state_transition (curr_state, NULL);
2256
2257 if (targetm.sched.dfa_post_cycle_insn)
2258 state_transition (curr_state,
2259 targetm.sched.dfa_post_cycle_insn ());
2260 }
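
/* The dfa_pre_cycle_insn / dfa_post_cycle_insn target hooks are optional;
   when a port defines them, the corresponding special transitions bracket
   the NULL transition that actually advances the DFA by one cycle.  */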
2261
2262 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2263 the number of cycles the kernel fits in according to the DFA. We
2264 use this to check whether we did well with SMS after register moves
2265 are added: in some cases the register-move overhead makes the result
2266 even worse than the original loop. We want SMS to be performed only
2267 when it yields fewer cycles after register moves are added.  */
2268 static int
2269 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2270 {
2271 int cycles = 0;
2272 rtx insn;
2273 int can_issue_more = issue_rate;
2274
2275 state_reset (curr_state);
2276
2277 for (insn = first_insn;
2278 insn != NULL_RTX && insn != last_insn;
2279 insn = NEXT_INSN (insn))
2280 {
2281 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2282 continue;
2283
2284 /* Check if there is room for the current insn. */
2285 if (!can_issue_more || state_dead_lock_p (curr_state))
2286 {
2287 cycles ++;
2288 advance_one_cycle ();
2289 can_issue_more = issue_rate;
2290 }
2291
2292 /* Update the DFA state; if the DFA found a resource conflict,
2293 close the current cycle and advance to a new one.  */
2294 if (state_transition (curr_state, insn) >= 0)
2295 {
2296 cycles ++;
2297 advance_one_cycle ();
2298 can_issue_more = issue_rate;
2299 }
2300
2301 if (targetm.sched.variable_issue)
2302 can_issue_more =
2303 targetm.sched.variable_issue (sched_dump, sched_verbose,
2304 insn, can_issue_more);
2305 /* A naked CLOBBER or USE generates no instruction, so don't
2306 let them consume issue slots. */
2307 else if (GET_CODE (PATTERN (insn)) != USE
2308 && GET_CODE (PATTERN (insn)) != CLOBBER)
2309 can_issue_more--;
2310 }
2311 return cycles;
2312 }
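
/* Illustration only: a hypothetical caller could compare the DFA cycle
   count of the original loop body with that of the SMS kernel (register
   moves included) and keep whichever is cheaper, e.g.:

     int orig_cycles = kernel_number_of_cycles (orig_head, orig_end);
     int sms_cycles = kernel_number_of_cycles (kern_head, kern_end);
     if (sms_cycles >= orig_cycles)
       ...undo the SMS transformation...

   where orig_head, kern_head etc. are assumed names, not part of this
   file.  */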
2313
2314 /* Checks if PS has resource conflicts according to DFA, starting from
2315 FROM cycle to TO cycle; returns true if there are conflicts and false
2316 if there are no conflicts. Assumes DFA is being used. */
2317 static int
2318 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2319 {
2320 int cycle;
2321
2322 state_reset (curr_state);
2323
2324 for (cycle = from; cycle <= to; cycle++)
2325 {
2326 ps_insn_ptr crr_insn;
2327 /* Holds the remaining issue slots in the current row. */
2328 int can_issue_more = issue_rate;
2329
2330 /* Walk through the DFA for the current row. */
2331 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2332 crr_insn;
2333 crr_insn = crr_insn->next_in_row)
2334 {
2335 rtx insn = crr_insn->node->insn;
2336
2337 if (!INSN_P (insn))
2338 continue;
2339
2340 /* Check if there is room for the current insn. */
2341 if (!can_issue_more || state_dead_lock_p (curr_state))
2342 return true;
2343
2344 /* Update the DFA state and return with failure if the DFA found
2345 resource conflicts.  */
2346 if (state_transition (curr_state, insn) >= 0)
2347 return true;
2348
2349 if (targetm.sched.variable_issue)
2350 can_issue_more =
2351 targetm.sched.variable_issue (sched_dump, sched_verbose,
2352 insn, can_issue_more);
2353 /* A naked CLOBBER or USE generates no instruction, so don't
2354 let them consume issue slots. */
2355 else if (GET_CODE (PATTERN (insn)) != USE
2356 && GET_CODE (PATTERN (insn)) != CLOBBER)
2357 can_issue_more--;
2358 }
2359
2360 /* Advance the DFA to the next cycle. */
2361 advance_one_cycle ();
2362 }
2363 return false;
2364 }
2365
2366 /* Checks if the given node causes resource conflicts when added to PS at
2367 cycle C. If not, the node is added to PS and returned; otherwise zero
2368 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2369 cuid N must come before/after (respectively) the node pointed to by
2370 PS_I when scheduled in the same cycle.  */
2371 ps_insn_ptr
2372 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2373 int c, sbitmap must_precede,
2374 sbitmap must_follow)
2375 {
2376 int has_conflicts = 0;
2377 ps_insn_ptr ps_i;
2378
2379 /* First add the node to the PS, if this succeeds check for
2380 conflicts, trying different issue slots in the same row. */
2381 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2382 return NULL; /* Failed to insert the node at the given cycle. */
2383
2384 has_conflicts = ps_has_conflicts (ps, c, c)
2385 || (ps->history > 0
2386 && ps_has_conflicts (ps,
2387 c - ps->history,
2388 c + ps->history));
2389
2390 /* Try different issue slots to find one that the given node can be
2391 scheduled in without conflicts. */
2392 while (has_conflicts)
2393 {
2394 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2395 break;
2396 has_conflicts = ps_has_conflicts (ps, c, c)
2397 || (ps->history > 0
2398 && ps_has_conflicts (ps,
2399 c - ps->history,
2400 c + ps->history));
2401 }
2402
2403 if (has_conflicts)
2404 {
2405 remove_node_from_ps (ps, ps_i);
2406 return NULL;
2407 }
2408
2409 ps->min_cycle = MIN (ps->min_cycle, c);
2410 ps->max_cycle = MAX (ps->max_cycle, c);
2411 return ps_i;
2412 }
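
/* Sketch of typical use (caller not shown on this page): per the SMS
   algorithm outline, the scheduler computes a window of feasible cycles
   for each node and probes the cycles one by one, e.g.:

     for (c = start; c != end; c += step)
       if ((psi = ps_add_node_check_conflicts (ps, u_node, c,
                                               must_precede, must_follow)))
         {
           SCHED_TIME (u_node) = c;
           break;
         }

   If no cycle in the window works, II is bumped and scheduling restarts.  */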
2413
2414 /* Rotate the rows of PS such that insns scheduled at time
2415 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2416 void
2417 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2418 {
2419 int i, row, backward_rotates;
2420 int last_row = ps->ii - 1;
2421
2422 if (start_cycle == 0)
2423 return;
2424
2425 backward_rotates = SMODULO (start_cycle, ps->ii);
2426
2427 /* Revisit later and optimize this into a single loop. */
2428 for (i = 0; i < backward_rotates; i++)
2429 {
2430 ps_insn_ptr first_row = ps->rows[0];
2431
2432 for (row = 0; row < last_row; row++)
2433 ps->rows[row] = ps->rows[row+1];
2434
2435 ps->rows[last_row] = first_row;
2436 }
2437
2438 ps->max_cycle -= start_cycle;
2439 ps->min_cycle -= start_cycle;
2440 }
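
/* A worked example: with II = 4 and START_CYCLE = 6, backward_rotates
   = SMODULO (6, 4) = 2, so rows [r0 r1 r2 r3] become [r2 r3 r0 r1];
   insns scheduled at cycle 6 (row 2) now appear in row 0, and both
   max_cycle and min_cycle are shifted down by 6.  */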
2441
2442 /* Remove the node N from the partial schedule PS. Because we restart
2443 the DFA each time we want to check for resource conflicts, this is
2444 equivalent to unscheduling the node N.  */
2445 static bool
2446 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2447 {
2448 ps_insn_ptr ps_i;
2449 int row = SMODULO (SCHED_TIME (n), ps->ii);
2450
2451 if (row < 0 || row > ps->ii)
2452 return false;
2453
2454 for (ps_i = ps->rows[row];
2455 ps_i && ps_i->node != n;
2456 ps_i = ps_i->next_in_row);
2457 if (!ps_i)
2458 return false;
2459
2460 return remove_node_from_ps (ps, ps_i);
2461 }
2462 #endif /* INSN_SCHEDULING */
2463 \f
2464 static bool
2465 gate_handle_sms (void)
2466 {
2467 return (optimize > 0 && flag_modulo_sched);
2468 }
2469
2470
2471 /* Run instruction scheduler.  */
2472 /* Perform SMS modulo scheduling.  */
2473 static unsigned int
2474 rest_of_handle_sms (void)
2475 {
2476 #ifdef INSN_SCHEDULING
2477 basic_block bb;
2478
2479 /* We want to be able to create new pseudos. */
2480 no_new_pseudos = 0;
2481 /* Collect loop information to be used in SMS. */
2482 cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
2483 sms_schedule ();
2484
2485 /* Update the life information, because we add pseudos. */
2486 max_regno = max_reg_num ();
2487 allocate_reg_info (max_regno, FALSE, FALSE);
2488 update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
2489 (PROP_DEATH_NOTES
2490 | PROP_REG_INFO
2491 | PROP_KILL_DEAD_CODE
2492 | PROP_SCAN_DEAD_CODE));
2493
2494 no_new_pseudos = 1;
2495
2496 /* Finalize layout changes. */
2497 FOR_EACH_BB (bb)
2498 if (bb->next_bb != EXIT_BLOCK_PTR)
2499 bb->aux = bb->next_bb;
2500 cfg_layout_finalize ();
2501 free_dominance_info (CDI_DOMINATORS);
2502 #endif /* INSN_SCHEDULING */
2503 return 0;
2504 }
2505
2506 struct tree_opt_pass pass_sms =
2507 {
2508 "sms", /* name */
2509 gate_handle_sms, /* gate */
2510 rest_of_handle_sms, /* execute */
2511 NULL, /* sub */
2512 NULL, /* next */
2513 0, /* static_pass_number */
2514 TV_SMS, /* tv_id */
2515 0, /* properties_required */
2516 0, /* properties_provided */
2517 0, /* properties_destroyed */
2518 TODO_dump_func, /* todo_flags_start */
2519 TODO_dump_func |
2520 TODO_ggc_collect, /* todo_flags_finish */
2521 'm' /* letter */
2522 };
2523