/* Swing Modulo Scheduling implementation.
   Copyright (C) 2004, 2005, 2006, 2007
   Free Software Foundation, Inc.
   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */


#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "toplev.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "regs.h"
#include "function.h"
#include "flags.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "except.h"
#include "toplev.h"
#include "recog.h"
#include "sched-int.h"
#include "target.h"
#include "cfglayout.h"
#include "cfgloop.h"
#include "cfghooks.h"
#include "expr.h"
#include "params.h"
#include "gcov-io.h"
#include "ddg.h"
#include "timevar.h"
#include "tree-pass.h"

#ifdef INSN_SCHEDULING

/* This file contains the implementation of the Swing Modulo Scheduler,
   described in the following references:
   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero, and J. Eckhardt.
       Lifetime-sensitive modulo scheduling in a production environment.
       IEEE Trans. on Comps., 50(3), March 2001
   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
       PACT '96, pages 80-87, October 1996 (Boston - Massachusetts - USA).

   The basic structure is:
   1. Build a data-dependence graph (DDG) for each loop.
   2. Use the DDG to order the insns of a loop (not necessarily in
      topological order), trying to place each insn after all its
      predecessors _or_ after all its successors.
   3. Compute MII: a lower bound on the number of cycles to schedule the loop.
   4. Use the ordering to perform list-scheduling of the loop:
      1. Set II = MII.  We will try to schedule the loop within II cycles.
      2. Try to schedule the insns one by one according to the ordering.
         For each insn compute an interval of cycles by considering already-
         scheduled preds and succs (and associated latencies); try to place
         the insn in the cycles of this window checking for potential
         resource conflicts (using the DFA interface).
         Note: this is different from the cycle-scheduling of schedule_insns;
         here the insns are not scheduled monotonically top-down (nor bottom-
         up).
      3. If we fail to schedule all insns, bump II and try again, unless
         II reaches an upper bound MaxII, in which case report failure.
   5. If we succeeded in scheduling the loop within II cycles, we now
      generate prolog and epilog, decrease the counter of the loop, and
      perform modulo variable expansion for live ranges that span more than
      II cycles (i.e. use register copies to prevent a def from overwriting
      itself before reaching the use).
*/
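
/* For illustration only (not part of the algorithm above): with II = 2,
   a def scheduled at cycle 0 whose use is scheduled at cycle 5 stays live
   longer than II cycles, so modulo variable expansion would rewrite, e.g.,

	r1 = a[i]	;; def, cycle 0
	...
	b[i] = r1	;; use, cycle 5

   into a chain of register copies r1' = r1; r1'' = r1' (one per II the
   live range spans), with the use reading the last copy.  */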

\f
/* This page defines partial-schedule structures and functions for
   modulo scheduling.  */

typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
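/* E.g. SMODULO (-1, 4) is 3, whereas in C the plain -1 % 4 is -1; this
   matters when mapping negative absolute cycles onto rows 0..ii-1.  */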

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
                             + 1 + (ps)->ii - 1) / (ps)->ii)
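/* I.e. the span of cycles divided by II, rounded up: e.g. cycles 2..9
   with ii = 4 cover 8 cycles, giving (8 + 4 - 1) / 4 = 2 stages.  */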

/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  */
  ps_insn_ptr next_in_row,
              prev_in_row;

  /* The number of nodes in the same row that come after this node.  */
  int row_rest_count;
};

/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;       /* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;    /* The DDG of the insns in the partial schedule.  */
};
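
/* For example, with ii = 3 an insn scheduled at absolute cycle 7 would
   belong to rows[SMODULO (7, 3)] = rows[1], next to any insns of cycles
   ..., -2, 1, 4, ... that map to the same row.  */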

/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.  */
struct undo_replace_buff_elem
{
  rtx insn;
  rtx orig_reg;
  rtx new_reg;
  struct undo_replace_buff_elem *next;
};



static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
                                                ddg_node_ptr node, int cycle,
                                                sbitmap must_precede,
                                                sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr);

\f
/* This page defines constants and structures for the modulo scheduling
   driver.  */

/* As in haifa-sched.c:  */
/* issue_rate is the number of insns that can be scheduled in the same
   machine cycle.  It can be defined in the config/mach/mach.h file,
   otherwise we set it to 1.  */

static int issue_rate;

static int sms_order_nodes (ddg_ptr, int, int *result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
                                       int from_stage, int to_stage,
                                       int is_prolog);

#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)

/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int asap;	/* A lower-bound on the absolute scheduling cycle.  */
  int time;	/* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
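
/* E.g. a node with time = 7 in a schedule with ii = 3 has row = 1 and
   stage = 2: it occupies the second row of the kernel and needs two
   stages of prolog before its first kernel execution.  */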

\f
/* The following three functions are copied from the current scheduler
   code in order to use sched_analyze() for computing the dependencies.
   They are used when initializing the sched_info structure.  */
static const char *
sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
{
  static char tmp[80];

  sprintf (tmp, "i%4d", INSN_UID (insn));
  return tmp;
}

static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
                               regset cond_exec ATTRIBUTE_UNUSED,
                               regset used ATTRIBUTE_UNUSED,
                               regset set ATTRIBUTE_UNUSED)
{
}

static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  NULL,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0,

  NULL, NULL, NULL, NULL, NULL,
  0
};


/* Return the register decremented and tested in INSN,
   or zero if it is not a decrement-and-branch insn.  */

static rtx
doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
{
#ifdef HAVE_doloop_end
  rtx pattern, reg, condition;

  if (! JUMP_P (insn))
    return NULL_RTX;

  pattern = PATTERN (insn);
  condition = doloop_condition_get (pattern);
  if (! condition)
    return NULL_RTX;

  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
           && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  return reg;
#else
  return NULL_RTX;
#endif
}

/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
   that the number of iterations is a compile-time constant.  If so,
   return the rtx that sets COUNT_REG to a constant, and set COUNT to
   this constant.  Otherwise return 0.  */
static rtx
const_iteration_count (rtx count_reg, basic_block pre_header,
                       HOST_WIDEST_INT *count)
{
  rtx insn;
  rtx head, tail;

  if (! pre_header)
    return NULL_RTX;

  get_ebb_head_tail (pre_header, pre_header, &head, &tail);

  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
    if (INSN_P (insn) && single_set (insn)
        && rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
      {
        rtx pat = single_set (insn);

        if (GET_CODE (SET_SRC (pat)) == CONST_INT)
          {
            *count = INTVAL (SET_SRC (pat));
            return insn;
          }

        return NULL_RTX;
      }

  return NULL_RTX;
}

/* A very simple resource-based lower bound on the initiation interval.
   ??? Improve the accuracy of this bound by considering the
   utilization of various units.  */
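/* For example, a DDG with 8 nodes on a machine with issue_rate 2 cannot
   complete an iteration in fewer than 8 / 2 = 4 cycles, whatever the
   dependences, so the bound below would be 4.  */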
static int
res_MII (ddg_ptr g)
{
  return (g->num_nodes / issue_rate);
}


/* Points to the array that contains the sched data for each node.  */
static node_sched_params_ptr node_sched_params;

/* Allocate sched_params for each node and initialize it.  Assumes that
   the aux field of each node contains the asap bound (computed earlier),
   and copies it into the sched_params field.  */
static void
set_node_sched_params (ddg_ptr g)
{
  int i;

  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
  /* Initialize ASAP/ALAP/HEIGHT to zero.  */
  node_sched_params = (node_sched_params_ptr)
                       xcalloc (g->num_nodes,
                                sizeof (struct node_sched_params));

  /* Set the pointer of the general data of the node to point to the
     appropriate sched_params structure.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      /* Watch out for aliasing problems?  */
      node_sched_params[i].asap = g->nodes[i].aux.count;
      g->nodes[i].aux.info = &node_sched_params[i];
    }
}

static void
print_node_sched_params (FILE *file, int num_nodes)
{
  int i;

  if (! file)
    return;
  for (i = 0; i < num_nodes; i++)
    {
      node_sched_params_ptr nsp = &node_sched_params[i];
      rtx reg_move = nsp->first_reg_move;
      int j;

      fprintf (file, "Node %d:\n", i);
      fprintf (file, " asap = %d:\n", nsp->asap);
      fprintf (file, " time = %d:\n", nsp->time);
      fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
      for (j = 0; j < nsp->nreg_moves; j++)
        {
          fprintf (file, " reg_move = ");
          print_rtl_single (file, reg_move);
          reg_move = PREV_INSN (reg_move);
        }
    }
}

/* Calculate an upper bound for II.  SMS should not schedule the loop if it
   requires more cycles than this bound.  Currently set to the sum of the
   longest latency edge for each node.  Tune based on experiments.  */
static int
calculate_maxii (ddg_ptr g)
{
  int i;
  int maxii = 0;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int max_edge_latency = 0;

      for (e = u->out; e; e = e->next_out)
        max_edge_latency = MAX (max_edge_latency, e->latency);

      maxii += max_edge_latency;
    }
  return maxii;
}

/*
   Breaking intra-loop register anti-dependences:
   Each intra-loop register anti-dependence implies a cross-iteration true
   dependence of distance 1.  Therefore, we can remove such false dependences
   and figure out if the partial schedule broke them by checking if (for a
   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
   if so generate a register move.  The number of such moves is equal to:

                SCHED_TIME (use) - SCHED_TIME (def)       { 0 if the dependence
   nreg_moves = ----------------------------------- + 1 - {     was broken,
                                 ii                       { 1 if not.
*/
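/* E.g. with ii = 4, a distance-1 true dependence from a def at cycle 2 to
   a use at cycle 7 needs (7 - 2) / 4 + 1 = 2 reg_moves, matching the
   distance == 1 case of nreg_moves4e below.  */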
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps, bool rescan)
{
  ddg_ptr g = ps->g;
  int ii = ps->ii;
  int i;
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx last_reg_move;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
         ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

            /* If dest precedes src in the schedule of the kernel, then dest
               will read before src writes and we can save one reg_copy.  */
            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              nreg_moves4e--;

            nreg_moves = MAX (nreg_moves, nreg_moves4e);
          }

      if (nreg_moves == 0)
        continue;

      /* Every use of the register defined by node may require a different
         copy of this register, depending on the time the use is scheduled.
         Set a bitmap vector, telling which nodes use each copy of this
         register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              dest_copy--;

            if (dest_copy)
              SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
          }

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      last_reg_move = u->insn;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
        {
          unsigned int i_use = 0;
          rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
          rtx reg_move = gen_move_insn (new_reg, prev_reg);
          sbitmap_iterator sbi;

          add_insn_before (reg_move, last_reg_move, NULL);
          last_reg_move = reg_move;

          if (!SCHED_FIRST_REG_MOVE (u))
            SCHED_FIRST_REG_MOVE (u) = reg_move;

          EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
            {
              struct undo_replace_buff_elem *rep;

              rep = (struct undo_replace_buff_elem *)
                    xcalloc (1, sizeof (struct undo_replace_buff_elem));
              rep->insn = g->nodes[i_use].insn;
              rep->orig_reg = old_reg;
              rep->new_reg = new_reg;

              if (! reg_move_replaces)
                reg_move_replaces = rep;
              else
                {
                  rep->next = reg_move_replaces;
                  reg_move_replaces = rep;
                }

              replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
              if (rescan)
                df_insn_rescan (g->nodes[i_use].insn);
            }

          prev_reg = new_reg;
        }
      sbitmap_vector_free (uses_of_defs);
    }
  return reg_move_replaces;
}

/* We call this when we want to undo the SMS schedule for a given loop.
   One of the things that we do is to delete the register moves generated
   for the sake of SMS; this function deletes the register move instructions
   recorded in the undo buffer.  */
static void
undo_generate_reg_moves (partial_schedule_ptr ps,
                         struct undo_replace_buff_elem *reg_move_replaces)
{
  int i, j;

  for (i = 0; i < ps->g->num_nodes; i++)
    {
      ddg_node_ptr u = &ps->g->nodes[i];
      rtx prev;
      rtx crr = SCHED_FIRST_REG_MOVE (u);

      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
        {
          prev = PREV_INSN (crr);
          delete_insn (crr);
          crr = prev;
        }
      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
    }

  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
    }
}

/* Free memory allocated for the undo buffer.  */
static void
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
{
  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      free (rep);
    }
}

/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
   of SCHED_ROW and SCHED_STAGE.  */
static void
normalize_sched_times (partial_schedule_ptr ps)
{
  int i;
  ddg_ptr g = ps->g;
  int amount = PS_MIN_CYCLE (ps);
  int ii = ps->ii;

  /* Don't include the closing branch assuming that it is the last node.  */
  for (i = 0; i < g->num_nodes - 1; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      int normalized_time = SCHED_TIME (u) - amount;

      gcc_assert (normalized_time >= 0);

      SCHED_TIME (u) = normalized_time;
      SCHED_ROW (u) = normalized_time % ii;
      SCHED_STAGE (u) = normalized_time / ii;
    }
}

/* Set SCHED_COLUMN of each node according to its position in PS.  */
static void
set_columns_for_ps (partial_schedule_ptr ps)
{
  int row;

  for (row = 0; row < ps->ii; row++)
    {
      ps_insn_ptr cur_insn = ps->rows[row];
      int column = 0;

      for (; cur_insn; cur_insn = cur_insn->next_in_row)
        SCHED_COLUMN (cur_insn->node) = column++;
    }
}

/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
static void
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int ii = ps->ii;
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      if (PREV_INSN (last) != ps_ij->node->insn)
        reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
                            PREV_INSN (last));
}

/* As part of undoing SMS we return to the original ordering of the
   instructions inside the loop kernel.  Given the partial schedule PS, this
   function restores the ordering of the instructions according to their
   CUIDs in the DDG (PS->G), which is the original order of the instructions
   before performing SMS.  */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  for (i = 0; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
        || last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
                          PREV_INSN (last));
}

/* Used to generate the prologue & epilogue.  Duplicate the subset of
   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
   of both), together with a prefix/suffix of their reg_moves.  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
                           int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
        ddg_node_ptr u_node = ps_ij->node;
        int j, i_reg_moves;
        rtx reg_move = NULL_RTX;

        if (for_prolog)
          {
            /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
               number of reg_moves starting with the second occurrence of
               u_node, which is generated if its SCHED_STAGE <= to_stage.  */
            i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *first* reg_move backwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < i_reg_moves; j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }
        else /* It's for the epilog.  */
          {
            /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
               starting to decrease one stage after u_node no longer occurs;
               that is, generate all reg_moves until
               SCHED_STAGE (u_node) == from_stage - 1.  */
            i_reg_moves = SCHED_NREG_MOVES (u_node)
                          - (from_stage - SCHED_STAGE (u_node) - 1);
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *last* reg_move forwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }

        for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
          emit_insn (copy_rtx (PATTERN (reg_move)));
        if (SCHED_STAGE (u_node) >= from_stage
            && SCHED_STAGE (u_node) <= to_stage)
          duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}


/* Generate the instructions (including reg_moves) for prolog & epilog.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop, rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
    /* Generate a subtract instruction at the beginning of the prolog to
       adjust the loop count by STAGE_COUNT.  */
    emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog on the entry edge.  */
  e = loop_preheader_edge (loop);
  split_edge_and_insert (e, get_insns ());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilogue on the exit edge.  */
  gcc_assert (single_exit (loop));
  e = single_exit (loop);
  split_edge_and_insert (e, get_insns ());
  end_sequence ();
}

/* Return true if all the BBs of the loop are empty except the
   loop header.  */
static bool
loop_single_full_bb_p (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      rtx head, tail;
      bool empty_bb = true;

      if (bbs[i] == loop->header)
        continue;

      /* Make sure that basic blocks other than the header
         have only notes, labels or jumps.  */
      get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
        {
          if (NOTE_P (head) || LABEL_P (head)
              || (INSN_P (head) && JUMP_P (head)))
            continue;
          empty_bb = false;
          break;
        }

      if (! empty_bb)
        {
          free (bbs);
          return false;
        }
    }
  free (bbs);
  return true;
}

/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3) \
                                  && (EDGE_COUNT (loop->latch->preds) == 1) \
                                  && (EDGE_COUNT (loop->latch->succs) == 1))
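
/* E.g. a header block plus a latch whose only predecessor is the header
   and whose only successor leads back to the header qualifies; a latch
   with side entries does not.  */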

/* Return true if the loop is in its canonical form, i.e. it is a
   SIMPLE_SMS_LOOP_P loop that has one preheader block and a single exit,
   and false if not.  */
static bool
loop_canon_p (struct loop *loop)
{
  if (loop->inner || !loop_outer (loop))
    return false;

  if (!single_exit (loop))
    {
      if (dump_file)
        {
          rtx insn = BB_END (loop->header);

          fprintf (dump_file, "SMS loop many exits ");
          fprintf (dump_file, " %s %d (file, line)\n",
                   insn_file (insn), insn_line (insn));
        }
      return false;
    }

  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
    {
      if (dump_file)
        {
          rtx insn = BB_END (loop->header);

          fprintf (dump_file, "SMS loop many BBs. ");
          fprintf (dump_file, " %s %d (file, line)\n",
                   insn_file (insn), insn_line (insn));
        }
      return false;
    }

  return true;
}

/* If there is more than one entry to the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      split_edge (e);

  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      FOR_EACH_EDGE (e, i, loop->header->preds)
        if (e->src == loop->latch)
          break;
      split_edge (e);
    }
}

/* Probability in % that the sms-ed loop rolls enough so that the optimized
   version may be entered.  Just a guess.  */
#define PROB_SMS_ENOUGH_ITERATIONS 80

/* Main entry point, perform SMS scheduling on the loops of the function
   that consist of single basic blocks.  */
static void
sms_schedule (void)
{
  static int passes = 0;
  rtx insn;
  ddg_ptr *g_arr, g;
  int *node_order;
  int maxii;
  loop_iterator li;
  partial_schedule_ptr ps;
  basic_block bb = NULL;
  struct loop *loop;
  basic_block condition_bb = NULL;
  edge latch_edge;
  gcov_type trip_count = 0;

  loop_optimizer_init (LOOPS_HAVE_PREHEADERS
                       | LOOPS_HAVE_RECORDED_EXITS);
  if (number_of_loops () <= 1)
    {
      loop_optimizer_finalize ();
      return;  /* There are no loops to schedule.  */
    }

  /* Initialize issue_rate.  */
  if (targetm.sched.issue_rate)
    {
      int temp = reload_completed;

      reload_completed = 1;
      issue_rate = targetm.sched.issue_rate ();
      reload_completed = temp;
    }
  else
    issue_rate = 1;

  /* Initialize the scheduler.  */
  current_sched_info = &sms_sched_info;

  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
  df_set_flags (DF_LR_RUN_DCE);
  df_rd_add_problem ();
  df_ru_add_problem ();
  df_note_add_problem ();
  df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
  df_analyze ();
  regstat_compute_calls_crossed ();
  sched_init ();

  /* Allocate memory to hold the DDG array, one entry for each loop.
     We use loop->num as index into this array.  */
  g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());

  /* Build DDGs for all the relevant loops and hold them in G_ARR
     indexed by the loop index.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      rtx head, tail;
      rtx count_reg;

      /* For debugging.  */
      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
        {
          if (dump_file)
            fprintf (dump_file, "SMS reached MAX_PASSES... \n");

          break;
        }

      if (! loop_canon_p (loop))
        continue;

      if (! loop_single_full_bb_p (loop))
        continue;

      bb = loop->header;

      get_ebb_head_tail (bb, bb, &head, &tail);
      latch_edge = loop_latch_edge (loop);
      gcc_assert (single_exit (loop));
      if (single_exit (loop)->count)
        trip_count = latch_edge->count / single_exit (loop)->count;

      /* Perform SMS only on loops whose average count is above the
         threshold.  */

      if (latch_edge->count
          && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
        {
          if (dump_file)
            {
              fprintf (dump_file, " %s %d (file, line)\n",
                       insn_file (tail), insn_line (tail));
              fprintf (dump_file, "SMS single-bb-loop\n");
              if (profile_info && flag_branch_probabilities)
                {
                  fprintf (dump_file, "SMS loop-count ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) bb->count);
                  fprintf (dump_file, "\n");
                  fprintf (dump_file, "SMS trip-count ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) trip_count);
                  fprintf (dump_file, "\n");
                  fprintf (dump_file, "SMS profile-sum-max ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) profile_info->sum_max);
                  fprintf (dump_file, "\n");
                }
            }
          continue;
        }

      /* Make sure this is a doloop.  */
      if (! (count_reg = doloop_register_get (tail)))
        continue;

      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
        if (CALL_P (insn)
            || BARRIER_P (insn)
            || (INSN_P (insn) && !JUMP_P (insn)
                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
          break;

      if (insn != NEXT_INSN (tail))
        {
          if (dump_file)
            {
              if (CALL_P (insn))
                fprintf (dump_file, "SMS loop-with-call\n");
              else if (BARRIER_P (insn))
                fprintf (dump_file, "SMS loop-with-barrier\n");
              else
                fprintf (dump_file, "SMS loop-with-not-single-set\n");
              print_rtl_single (dump_file, insn);
            }

          continue;
        }

      if (! (g = create_ddg (bb, 0)))
        {
          if (dump_file)
            fprintf (dump_file, "SMS doloop\n");
          continue;
        }

      g_arr[loop->num] = g;
    }

  /* We don't want to perform SMS on new loops - created by versioning.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      rtx head, tail;
      rtx count_reg, count_init;
      int mii, rec_mii;
      unsigned stage_count = 0;
      HOST_WIDEST_INT loop_count = 0;

      if (! (g = g_arr[loop->num]))
        continue;

      if (dump_file)
        print_ddg (dump_file, g);

      get_ebb_head_tail (loop->header, loop->header, &head, &tail);

      latch_edge = loop_latch_edge (loop);
      gcc_assert (single_exit (loop));
      if (single_exit (loop)->count)
        trip_count = latch_edge->count / single_exit (loop)->count;

      if (dump_file)
        {
          fprintf (dump_file, " %s %d (file, line)\n",
                   insn_file (tail), insn_line (tail));
          fprintf (dump_file, "SMS single-bb-loop\n");
          if (profile_info && flag_branch_probabilities)
            {
              fprintf (dump_file, "SMS loop-count ");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) bb->count);
              fprintf (dump_file, "\n");
              fprintf (dump_file, "SMS profile-sum-max ");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) profile_info->sum_max);
              fprintf (dump_file, "\n");
            }
          fprintf (dump_file, "SMS doloop\n");
          fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
          fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
          fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
        }


      /* In case the loop has a doloop register, it gets special
         handling.  */
      count_init = NULL_RTX;
      if ((count_reg = doloop_register_get (tail)))
        {
          basic_block pre_header;

          pre_header = loop_preheader_edge (loop)->src;
          count_init = const_iteration_count (count_reg, pre_header,
                                              &loop_count);
        }
      gcc_assert (count_reg);

      if (dump_file && count_init)
        {
          fprintf (dump_file, "SMS const-doloop ");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                   loop_count);
          fprintf (dump_file, "\n");
        }

      node_order = XNEWVEC (int, g->num_nodes);

      mii = 1; /* Need to pass some estimate of mii.  */
      rec_mii = sms_order_nodes (g, mii, node_order);
      mii = MAX (res_MII (g), rec_mii);
      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;

      if (dump_file)
        fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
                 rec_mii, mii, maxii);

      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
         ASAP.  */
      set_node_sched_params (g);

      ps = sms_schedule_by_order (g, mii, maxii, node_order);

      if (ps)
        stage_count = PS_STAGE_COUNT (ps);

      /* Stage count of 1 means that there is no interleaving between
         iterations; let the scheduling passes do the job.  */
      if (stage_count < 1
          || (count_init && (loop_count <= stage_count))
          || (flag_branch_probabilities && (trip_count <= stage_count)))
        {
          if (dump_file)
            {
              fprintf (dump_file, "SMS failed... \n");
              fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
              fprintf (dump_file, ", trip-count=");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
              fprintf (dump_file, ")\n");
            }
          continue;
        }
      else
        {
          int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
          int new_cycles;
          struct undo_replace_buff_elem *reg_move_replaces;

          if (dump_file)
            {
              fprintf (dump_file,
                       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
                       stage_count);
              print_partial_schedule (ps, dump_file);
              fprintf (dump_file,
                       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
                       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
            }

          /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
             the closing_branch was scheduled and should appear in the last (ii-1)
             row.  Otherwise, we are free to schedule the branch, and we let nodes
             that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
             row; this should reduce stage_count to minimum.  */
          normalize_sched_times (ps);
          rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
          set_columns_for_ps (ps);

          /* Generate the kernel just to be able to measure its cycles.  */
          permute_partial_schedule (ps, g->closing_branch->first_note);
          reg_move_replaces = generate_reg_moves (ps, false);

          /* Get the number of cycles the new kernel expects to execute in.  */
          new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));

          /* Get back to the original loop so we can do loop versioning.  */
          undo_permute_partial_schedule (ps, g->closing_branch->first_note);
          if (reg_move_replaces)
            undo_generate_reg_moves (ps, reg_move_replaces);

          if (new_cycles >= orig_cycles)
            {
              /* SMS is not profitable, so undo the permutation and reg move
                 generation and return the kernel to its original state.  */
              if (dump_file)
                fprintf (dump_file, "Undoing SMS because it is not profitable.\n");

            }
          else
            {
              canon_loop (loop);

              /* In case the BCT count is not known, do loop-versioning.  */
              if (count_reg && ! count_init)
                {
                  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
                                                 GEN_INT (stage_count));
                  unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
                                   * REG_BR_PROB_BASE) / 100;

                  loop_version (loop, comp_rtx, &condition_bb,
                                prob, prob, REG_BR_PROB_BASE - prob,
                                true);
                }

              /* Set new iteration count of loop kernel.  */
              if (count_reg && count_init)
                SET_SRC (single_set (count_init)) = GEN_INT (loop_count
                                                             - stage_count + 1);

              /* Now apply the scheduled kernel to the RTL of the loop.  */
              permute_partial_schedule (ps, g->closing_branch->first_note);

              /* Mark this loop as software pipelined so the later
                 scheduling passes don't touch it.  */
              if (! flag_resched_modulo_sched)
                g->bb->flags |= BB_DISABLE_SCHEDULE;
              /* The life-info is not valid any more.  */
              df_set_bb_dirty (g->bb);

              reg_move_replaces = generate_reg_moves (ps, true);
              if (dump_file)
                print_node_sched_params (dump_file, g->num_nodes);
              /* Generate prolog and epilog.  */
              if (count_reg && !count_init)
                generate_prolog_epilog (ps, loop, count_reg);
              else
                generate_prolog_epilog (ps, loop, NULL_RTX);
            }
          free_undo_replace_buff (reg_move_replaces);
        }

      free_partial_schedule (ps);
      free (node_sched_params);
      free (node_order);
      free_ddg (g);
    }

  regstat_free_calls_crossed ();
  free (g_arr);

  /* Release scheduler data, needed until now because of DFA.  */
  sched_finish ();
  loop_optimizer_finalize ();
}

/* The SMS scheduling algorithm itself
   -----------------------------------
   Input: 'O' an ordered list of insns of a loop.
   Output: A scheduling of the loop - kernel, prolog, and epilogue.

   'Q' is the empty Set
   'PS' is the partial schedule; it holds the currently scheduled nodes with
        their cycle/slot.
   'PSP' previously scheduled predecessors.
   'PSS' previously scheduled successors.
   't(u)' the cycle where u is scheduled.
   'l(u)' is the latency of u.
   'd(v,u)' is the dependence distance from v to u.
   'ASAP(u)' the earliest time at which u could be scheduled as computed in
             the node ordering phase.
   'check_hardware_resources_conflicts(u, PS, c)'
             run a trace around cycle/slot through DFA model
             to check resource conflicts involving instruction u
             at cycle c given the partial schedule PS.
   'add_to_partial_schedule_at_time(u, PS, c)'
             Add the node/instruction u to the partial schedule
             PS at time c.
   'calculate_register_pressure(PS)'
             Given a schedule of instructions, calculate the register
             pressure it implies.  One implementation could be the
             maximum number of overlapping live ranges.
   'maxRP' The maximum allowed register pressure, usually derived from the
           number of registers available in the hardware.

   1. II = MII.
   2. PS = empty list
   3. for each node u in O in pre-computed order
   4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   6.     start = Early_start; end = Early_start + II - 1; step = 1
   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   13.    start = Late_start; end = Late_start - II + 1; step = -1
   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.    Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   16.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   17.    start = Early_start;
   18.    end = min(Early_start + II - 1 , Late_start);
   19.    step = 1
   20.  else "if (PSP(u) == Q && PSS(u) == Q)"
   21.    start = ASAP(u); end = start + II - 1; step = 1
   22.  endif

   23.  success = false
   24.  for (c = start ; c != end ; c += step)
   25.     if check_hardware_resources_conflicts(u, PS, c) then
   26.       add_to_partial_schedule_at_time(u, PS, c)
   27.       success = true
   28.       break
   29.     endif
   30.  endfor
   31.  if (success == false) then
   32.    II = II + 1
   33.    if (II > maxII) then
   34.       finish - failed to schedule
   35.    endif
   36.    goto 2.
   37.  endif
   38. endfor
   39. if (calculate_register_pressure(PS) > maxRP) then
   40.    goto 32.
   41. endif
   42. compute epilogue & prologue
   43. finish - succeeded to schedule
*/
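
/* To illustrate step 5 above: with II = 2, if u's only scheduled
   predecessor v has t(v) = 5, l(v) = 2 and d(v,u) = 1, then
   Early_start(u) = 5 + 2 - 1*2 = 5 and the window tried for u
   is the cycles [5, 6].  */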

/* A limit on the number of cycles that resource conflicts can span.  ??? Should
   be provided by DFA, and be dependent on the type of insn scheduled.  Currently
   set to 0 to save compile time.  */
#define DFA_HISTORY SMS_DFA_HISTORY

/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get a non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
                  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order[i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int node_st = SCHED_TIME (v_node)
                            + e->latency - (e->distance * ii);

              early_start = MAX (early_start, node_st);

              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MAX (end, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              early_start = MAX (early_start,
                                 SCHED_TIME (v_node) + e->latency
                                 - (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}

/* This function implements the scheduling algorithm for SMS described
   above.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
          || try_again_with_larger_ii) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
        fprintf (dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
        {
          try_again_with_larger_ii = false;
          sbitmap_zero (sched_nodes);
        }

      for (i = 0; i < num_nodes; i++)
        {
          int u = nodes_order[i];
          ddg_node_ptr u_node = &ps->g->nodes[u];
          rtx insn = u_node->insn;

          if (!INSN_P (insn))
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (JUMP_P (insn)) /* Closing branch handled later.  */
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (TEST_BIT (sched_nodes, u))
            continue;

          /* Try to get a non-empty scheduling window.  */
          j = i;
          while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
                 && j > 0)
            {
              unscheduled_nodes = true;
              if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
                  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
                {
                  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
                  RESET_BIT (sched_nodes, nodes_order[j - 1]);
                }
              j--;
            }
          if (j < 0)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          /* 2. Try scheduling u in window.  */
          if (dump_file)
            fprintf (dump_file,
                     "Trying to schedule node %d in (%d .. %d) step %d\n",
                     u, start, end, step);

          /* Use must_follow & must_precede bitmaps to determine order
             of nodes within the cycle.  */
          sbitmap_zero (must_precede);
          sbitmap_zero (must_follow);
          for (e = u_node->in; e != 0; e = e->next_in)
            if (TEST_BIT (sched_nodes, e->src->cuid)
                && e->latency == (ii * e->distance)
                && start == SCHED_TIME (e->src))
              SET_BIT (must_precede, e->src->cuid);

          for (e = u_node->out; e != 0; e = e->next_out)
            if (TEST_BIT (sched_nodes, e->dest->cuid)
                && e->latency == (ii * e->distance)
                && end == SCHED_TIME (e->dest))
              SET_BIT (must_follow, e->dest->cuid);

          success = 0;
          if ((step > 0 && start < end) || (step < 0 && start > end))
            for (c = start; c != end; c += step)
              {
                ps_insn_ptr psi;

                psi = ps_add_node_check_conflicts (ps, u_node, c,
                                                   must_precede,
                                                   must_follow);

                if (psi)
                  {
                    SCHED_TIME (u_node) = c;
                    SET_BIT (sched_nodes, u);
                    success = 1;
                    if (dump_file)
                      fprintf (dump_file, "Schedule in %d\n", c);
                    break;
                  }
              }
          if (!success)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          if (unscheduled_nodes)
            break;

          /* ??? If (success), check register pressure estimates.  */
        } /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}

\f
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))
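
/* E.g. a node with ASAP = 2 and ALAP = 5 has MOB = 3: it can slide over
   three cycles without stretching the critical path.  */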

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

enum sms_direction {BOTTOMUP, TOPDOWN};

struct node_order_params
{
  int asap;
  int alap;
  int height;
};

/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
static void
check_nodes_order (int *node_order, int num_nodes)
{
  int i;
  sbitmap tmp = sbitmap_alloc (num_nodes);

  sbitmap_zero (tmp);

  for (i = 0; i < num_nodes; i++)
    {
      int u = node_order[i];

      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));

      SET_BIT (tmp, u);
    }

  sbitmap_free (tmp);
}

/* Order the nodes of G for scheduling and pass the result in
   NODE_ORDER.  Also set aux.count of each node to ASAP.
   Return the recMII for the given DDG.  */
static int
sms_order_nodes (ddg_ptr g, int mii, int *node_order)
{
  int i;
  int rec_mii = 0;
  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);

  nopa nops = calculate_order_params (g, mii);

  if (dump_file)
    print_sccs (dump_file, sccs, g);

  order_nodes_of_sccs (sccs, node_order);

  if (sccs->num_sccs > 0)
    /* First SCC has the largest recurrence_length.  */
    rec_mii = sccs->sccs[0]->recurrence_length;

  /* Save ASAP before destroying node_order_params.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr v = &g->nodes[i];
      v->aux.count = ASAP (v);
    }

  free (nops);
  free_ddg_all_sccs (sccs);
  check_nodes_order (node_order, g->num_nodes);

  return rec_mii;
}

static void
order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int *node_order)
{
  int i, pos = 0;
  ddg_ptr g = all_sccs->ddg;
  int num_nodes = g->num_nodes;
  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
  sbitmap on_path = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap ones = sbitmap_alloc (num_nodes);

  sbitmap_zero (prev_sccs);
  sbitmap_ones (ones);

  /* Perform the node ordering starting from the SCC with the highest recMII.
     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
  for (i = 0; i < all_sccs->num_sccs; i++)
    {
      ddg_scc_ptr scc = all_sccs->sccs[i];

      /* Add nodes on paths from previous SCCs to the current SCC.  */
      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
      sbitmap_a_or_b (tmp, scc->nodes, on_path);

      /* Add nodes on paths from the current SCC to previous SCCs.  */
      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
      sbitmap_a_or_b (tmp, tmp, on_path);

      /* Remove nodes of previous SCCs from current extended SCC.  */
      sbitmap_difference (tmp, tmp, prev_sccs);

      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
    }

  /* Handle the remaining nodes that do not belong to any SCC.  Each call
     to order_nodes_in_scc handles a single connected component.  */
  while (pos < g->num_nodes)
    {
      sbitmap_difference (tmp, ones, prev_sccs);
      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
    }
  sbitmap_free (prev_sccs);
  sbitmap_free (on_path);
  sbitmap_free (tmp);
  sbitmap_free (ones);
}

/* MII is needed if we consider backarcs (that do not close recursive cycles).  */
static struct node_order_params *
calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
{
  int u;
  int max_asap;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  /* Allocate a place to hold ordering params for each node in the DDG.  */
  nopa node_order_params_arr;

  /* Initialize ASAP/ALAP/HEIGHT to zero.  */
  node_order_params_arr = (nopa) xcalloc (num_nodes,
                                          sizeof (struct node_order_params));

  /* Set the aux pointer of each node to point to its order_params structure.  */
  for (u = 0; u < num_nodes; u++)
    g->nodes[u].aux.info = &node_order_params_arr[u];

  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
     calculate ASAP, ALAP, mobility, distance, and height for each node
     in the dependence (directed acyclic) graph.  */

  /* We assume that the nodes in the array are in topological order.  */

  max_asap = 0;
  for (u = 0; u < num_nodes; u++)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ASAP (u_node) = 0;
      for (e = u_node->in; e; e = e->next_in)
        if (e->distance == 0)
          ASAP (u_node) = MAX (ASAP (u_node),
                               ASAP (e->src) + e->latency);
      max_asap = MAX (max_asap, ASAP (u_node));
    }

  for (u = num_nodes - 1; u > -1; u--)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ALAP (u_node) = max_asap;
      HEIGHT (u_node) = 0;
      for (e = u_node->out; e; e = e->next_out)
        if (e->distance == 0)
          {
            ALAP (u_node) = MIN (ALAP (u_node),
                                 ALAP (e->dest) - e->latency);
            HEIGHT (u_node) = MAX (HEIGHT (u_node),
                                   HEIGHT (e->dest) + e->latency);
          }
    }

  return node_order_params_arr;
}
1777
1778 static int
1779 find_max_asap (ddg_ptr g, sbitmap nodes)
1780 {
1781 unsigned int u = 0;
1782 int max_asap = -1;
1783 int result = -1;
1784 sbitmap_iterator sbi;
1785
1786 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1787 {
1788 ddg_node_ptr u_node = &g->nodes[u];
1789
1790 if (max_asap < ASAP (u_node))
1791 {
1792 max_asap = ASAP (u_node);
1793 result = u;
1794 }
1795 }
1796 return result;
1797 }
1798
1799 static int
1800 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1801 {
1802 unsigned int u = 0;
1803 int max_hv = -1;
1804 int min_mob = INT_MAX;
1805 int result = -1;
1806 sbitmap_iterator sbi;
1807
1808 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1809 {
1810 ddg_node_ptr u_node = &g->nodes[u];
1811
1812 if (max_hv < HEIGHT (u_node))
1813 {
1814 max_hv = HEIGHT (u_node);
1815 min_mob = MOB (u_node);
1816 result = u;
1817 }
1818 else if ((max_hv == HEIGHT (u_node))
1819 && (min_mob > MOB (u_node)))
1820 {
1821 min_mob = MOB (u_node);
1822 result = u;
1823 }
1824 }
1825 return result;
1826 }
1827
1828 static int
1829 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1830 {
1831 unsigned int u = 0;
1832 int max_dv = -1;
1833 int min_mob = INT_MAX;
1834 int result = -1;
1835 sbitmap_iterator sbi;
1836
1837 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1838 {
1839 ddg_node_ptr u_node = &g->nodes[u];
1840
1841 if (max_dv < DEPTH (u_node))
1842 {
1843 max_dv = DEPTH (u_node);
1844 min_mob = MOB (u_node);
1845 result = u;
1846 }
1847 else if ((max_dv == DEPTH (u_node))
1848 && (min_mob > MOB (u_node)))
1849 {
1850 min_mob = MOB (u_node);
1851 result = u;
1852 }
1853 }
1854 return result;
1855 }
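
/* Tie-breaking example (illustrative): if two workset nodes u and v
   both have the maximal HEIGHT but MOB (u) = 0 while MOB (v) = 2,
   find_max_hv_min_mob returns u; among equally high (or, for
   find_max_dv_min_mob, equally deep) nodes the least mobile one is
   the most constrained, so it is ordered first. */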
1856
1857 /* Places the nodes of SCC into the NODE_ORDER array starting
1858 at position POS, according to the SMS ordering algorithm.
1859 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1860 the NODE_ORDER array, starting from position zero. */
1861 static int
1862 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1863 int * node_order, int pos)
1864 {
1865 enum sms_direction dir;
1866 int num_nodes = g->num_nodes;
1867 sbitmap workset = sbitmap_alloc (num_nodes);
1868 sbitmap tmp = sbitmap_alloc (num_nodes);
1869 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1870 sbitmap predecessors = sbitmap_alloc (num_nodes);
1871 sbitmap successors = sbitmap_alloc (num_nodes);
1872
1873 sbitmap_zero (predecessors);
1874 find_predecessors (predecessors, g, nodes_ordered);
1875
1876 sbitmap_zero (successors);
1877 find_successors (successors, g, nodes_ordered);
1878
1879 sbitmap_zero (tmp);
1880 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1881 {
1882 sbitmap_copy (workset, tmp);
1883 dir = BOTTOMUP;
1884 }
1885 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1886 {
1887 sbitmap_copy (workset, tmp);
1888 dir = TOPDOWN;
1889 }
1890 else
1891 {
1892 int u;
1893
1894 sbitmap_zero (workset);
1895 if ((u = find_max_asap (g, scc)) >= 0)
1896 SET_BIT (workset, u);
1897 dir = BOTTOMUP;
1898 }
1899
1900 sbitmap_zero (zero_bitmap);
1901 while (!sbitmap_equal (workset, zero_bitmap))
1902 {
1903 int v;
1904 ddg_node_ptr v_node;
1905 sbitmap v_node_preds;
1906 sbitmap v_node_succs;
1907
1908 if (dir == TOPDOWN)
1909 {
1910 while (!sbitmap_equal (workset, zero_bitmap))
1911 {
1912 v = find_max_hv_min_mob (g, workset);
1913 v_node = &g->nodes[v];
1914 node_order[pos++] = v;
1915 v_node_succs = NODE_SUCCESSORS (v_node);
1916 sbitmap_a_and_b (tmp, v_node_succs, scc);
1917
1918 /* Don't consider the already ordered successors again. */
1919 sbitmap_difference (tmp, tmp, nodes_ordered);
1920 sbitmap_a_or_b (workset, workset, tmp);
1921 RESET_BIT (workset, v);
1922 SET_BIT (nodes_ordered, v);
1923 }
1924 dir = BOTTOMUP;
1925 sbitmap_zero (predecessors);
1926 find_predecessors (predecessors, g, nodes_ordered);
1927 sbitmap_a_and_b (workset, predecessors, scc);
1928 }
1929 else
1930 {
1931 while (!sbitmap_equal (workset, zero_bitmap))
1932 {
1933 v = find_max_dv_min_mob (g, workset);
1934 v_node = &g->nodes[v];
1935 node_order[pos++] = v;
1936 v_node_preds = NODE_PREDECESSORS (v_node);
1937 sbitmap_a_and_b (tmp, v_node_preds, scc);
1938
1939 /* Don't consider the already ordered predecessors again. */
1940 sbitmap_difference (tmp, tmp, nodes_ordered);
1941 sbitmap_a_or_b (workset, workset, tmp);
1942 RESET_BIT (workset, v);
1943 SET_BIT (nodes_ordered, v);
1944 }
1945 dir = TOPDOWN;
1946 sbitmap_zero (successors);
1947 find_successors (successors, g, nodes_ordered);
1948 sbitmap_a_and_b (workset, successors, scc);
1949 }
1950 }
1951 sbitmap_free (tmp);
1952 sbitmap_free (workset);
1953 sbitmap_free (zero_bitmap);
1954 sbitmap_free (predecessors);
1955 sbitmap_free (successors);
1956 return pos;
1957 }
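
/* Sketch of a run (illustrative): for an isolated two-node SCC {u, v}
   with v the max-ASAP node, the workset starts as {v} and
   dir = BOTTOMUP; the inner loop orders v, pulls in its in-SCC
   predecessor u and orders it, then the direction flips to TOPDOWN
   with a workset of the SCC successors of the ordered nodes (empty
   here), so the outer loop terminates. */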
1958
1959 \f
1960 /* This page contains functions for manipulating partial-schedules during
1961 modulo scheduling. */
1962
1963 /* Create a partial schedule and allocate memory to hold II rows. */
1964
1965 static partial_schedule_ptr
1966 create_partial_schedule (int ii, ddg_ptr g, int history)
1967 {
1968 partial_schedule_ptr ps = XNEW (struct partial_schedule);
1969 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1970 ps->ii = ii;
1971 ps->history = history;
1972 ps->min_cycle = INT_MAX;
1973 ps->max_cycle = INT_MIN;
1974 ps->g = g;
1975
1976 return ps;
1977 }
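
/* Layout sketch (illustrative): with ii = 3 the rows array folds the
   schedule modulo ii:

     cycle:  ... -1  0  1  2  3  4 ...
     row:    ...  2  0  1  2  0  1 ...

   An insn scheduled at cycle c lives in ps->rows[SMODULO (c, ii)];
   SMODULO, unlike C's % operator, stays non-negative for the negative
   cycles that scheduling windows may produce. */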
1978
1979 /* Free the PS_INSNs in the rows array of the given partial schedule.
1980 ??? Consider caching the PS_INSNs. */
1981 static void
1982 free_ps_insns (partial_schedule_ptr ps)
1983 {
1984 int i;
1985
1986 for (i = 0; i < ps->ii; i++)
1987 {
1988 while (ps->rows[i])
1989 {
1990 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1991
1992 free (ps->rows[i]);
1993 ps->rows[i] = ps_insn;
1994 }
1995 ps->rows[i] = NULL;
1996 }
1997 }
1998
1999 /* Free all the memory allocated to the partial schedule. */
2000
2001 static void
2002 free_partial_schedule (partial_schedule_ptr ps)
2003 {
2004 if (!ps)
2005 return;
2006 free_ps_insns (ps);
2007 free (ps->rows);
2008 free (ps);
2009 }
2010
2011 /* Clear the rows array with its PS_INSNs, and create a new one with
2012 NEW_II rows. */
2013
2014 static void
2015 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2016 {
2017 if (!ps)
2018 return;
2019 free_ps_insns (ps);
2020 if (new_ii == ps->ii)
2021 return;
2022 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2023 * sizeof (ps_insn_ptr));
2024 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2025 ps->ii = new_ii;
2026 ps->min_cycle = INT_MAX;
2027 ps->max_cycle = INT_MIN;
2028 }
2029
2030 /* Prints the partial schedule as an ii-row array; for each row
2031 print the UIDs of the insns in it. */
2032 void
2033 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2034 {
2035 int i;
2036
2037 for (i = 0; i < ps->ii; i++)
2038 {
2039 ps_insn_ptr ps_i = ps->rows[i];
2040
2041 fprintf (dump, "\n[CYCLE %d ]: ", i);
2042 while (ps_i)
2043 {
2044 fprintf (dump, "%d, ",
2045 INSN_UID (ps_i->node->insn));
2046 ps_i = ps_i->next_in_row;
2047 }
2048 }
2049 }
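
/* Sample dump (illustrative; the insn UIDs below are made up):

     [CYCLE 0 ]: 23, 27,
     [CYCLE 1 ]: 24,
     [CYCLE 2 ]: 25, 26,  */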
2050
2051 /* Creates a PS_INSN object and initializes it with the given parameters. */
2052 static ps_insn_ptr
2053 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2054 {
2055 ps_insn_ptr ps_i = XNEW (struct ps_insn);
2056
2057 ps_i->node = node;
2058 ps_i->next_in_row = NULL;
2059 ps_i->prev_in_row = NULL;
2060 ps_i->row_rest_count = rest_count;
2061 ps_i->cycle = cycle;
2062
2063 return ps_i;
2064 }
2065
2066
2067 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2068 node is not found in the partial schedule, else returns true. */
2069 static bool
2070 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2071 {
2072 int row;
2073
2074 if (!ps || !ps_i)
2075 return false;
2076
2077 row = SMODULO (ps_i->cycle, ps->ii);
2078 if (! ps_i->prev_in_row)
2079 {
2080 if (ps_i != ps->rows[row])
2081 return false;
2082
2083 ps->rows[row] = ps_i->next_in_row;
2084 if (ps->rows[row])
2085 ps->rows[row]->prev_in_row = NULL;
2086 }
2087 else
2088 {
2089 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2090 if (ps_i->next_in_row)
2091 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2092 }
2093 free (ps_i);
2094 return true;
2095 }
2096
2097 /* Unlike what literature describes for modulo scheduling (which focuses
2098 on VLIW machines) the order of the instructions inside a cycle is
2099 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2100 where the current instruction should go relative to the already
2101 scheduled instructions in the given cycle. Go over these
2102 instructions and find the first possible column to put it in. */
2103 static bool
2104 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2105 sbitmap must_precede, sbitmap must_follow)
2106 {
2107 ps_insn_ptr next_ps_i;
2108 ps_insn_ptr first_must_follow = NULL;
2109 ps_insn_ptr last_must_precede = NULL;
2110 int row;
2111
2112 if (! ps_i)
2113 return false;
2114
2115 row = SMODULO (ps_i->cycle, ps->ii);
2116
2117 /* Find the first must_follow insn and the last must_precede insn,
2118 and insert the node immediately after the last must_precede insn,
2119 making sure no must_follow insn comes before that point. */
2120 for (next_ps_i = ps->rows[row];
2121 next_ps_i;
2122 next_ps_i = next_ps_i->next_in_row)
2123 {
2124 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2125 && ! first_must_follow)
2126 first_must_follow = next_ps_i;
2127 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2128 {
2129 /* If we have already met a node that must follow, then
2130 there is no possible column. */
2131 if (first_must_follow)
2132 return false;
2133 else
2134 last_must_precede = next_ps_i;
2135 }
2136 }
2137
2138 /* Now insert the node right after LAST_MUST_PRECEDE, or at the start of the row if there is none. */
2139
2140 if (! last_must_precede)
2141 {
2142 ps_i->next_in_row = ps->rows[row];
2143 ps_i->prev_in_row = NULL;
2144 if (ps_i->next_in_row)
2145 ps_i->next_in_row->prev_in_row = ps_i;
2146 ps->rows[row] = ps_i;
2147 }
2148 else
2149 {
2150 ps_i->next_in_row = last_must_precede->next_in_row;
2151 last_must_precede->next_in_row = ps_i;
2152 ps_i->prev_in_row = last_must_precede;
2153 if (ps_i->next_in_row)
2154 ps_i->next_in_row->prev_in_row = ps_i;
2155 }
2156
2157 return true;
2158 }
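
/* Placement example (illustrative): if a row currently holds
   A -> B -> C, MUST_PRECEDE contains B and MUST_FOLLOW contains C,
   then last_must_precede = B and first_must_follow = C, and the new
   insn is linked in right after B, giving A -> B -> new -> C. Had
   MUST_FOLLOW contained A instead, a must_follow insn would precede
   the must_precede insn and no valid column would exist. */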
2159
2160 /* Advances the PS_INSN one column in its current row; returns false
2161 on failure and true on success. Bit N is set in MUST_FOLLOW if
2162 the node with cuid N must come after the node pointed to by
2163 PS_I when scheduled in the same cycle. */
2164 static int
2165 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2166 sbitmap must_follow)
2167 {
2168 ps_insn_ptr prev, next;
2169 int row;
2170 ddg_node_ptr next_node;
2171
2172 if (!ps || !ps_i)
2173 return false;
2174
2175 row = SMODULO (ps_i->cycle, ps->ii);
2176
2177 if (! ps_i->next_in_row)
2178 return false;
2179
2180 next_node = ps_i->next_in_row->node;
2181
2182 /* Check if next_in_row is dependent on ps_i, both having same sched
2183 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2184 if (TEST_BIT (must_follow, next_node->cuid))
2185 return false;
2186
2187 /* Advance PS_I over its next_in_row in the doubly linked list. */
2188 prev = ps_i->prev_in_row;
2189 next = ps_i->next_in_row;
2190
2191 if (ps_i == ps->rows[row])
2192 ps->rows[row] = next;
2193
2194 ps_i->next_in_row = next->next_in_row;
2195
2196 if (next->next_in_row)
2197 next->next_in_row->prev_in_row = ps_i;
2198
2199 next->next_in_row = ps_i;
2200 ps_i->prev_in_row = next;
2201
2202 next->prev_in_row = prev;
2203 if (prev)
2204 prev->next_in_row = next;
2205
2206 return true;
2207 }
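
/* Rewiring example (illustrative): advancing X one column in a row
   holding P -> X -> N -> Q produces P -> N -> X -> Q, provided N is
   not in MUST_FOLLOW (i.e. N need not come after X in this cycle). */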
2208
2209 /* Inserts a DDG_NODE into the given partial schedule at the given cycle.
2210 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2211 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2212 before/after (respectively) the node pointed to by PS_I when scheduled
2213 in the same cycle. */
2214 static ps_insn_ptr
2215 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2216 sbitmap must_precede, sbitmap must_follow)
2217 {
2218 ps_insn_ptr ps_i;
2219 int rest_count = 1;
2220 int row = SMODULO (cycle, ps->ii);
2221
2222 if (ps->rows[row]
2223 && ps->rows[row]->row_rest_count >= issue_rate)
2224 return NULL;
2225
2226 if (ps->rows[row])
2227 rest_count += ps->rows[row]->row_rest_count;
2228
2229 ps_i = create_ps_insn (node, rest_count, cycle);
2230
2231 /* Finds and inserts PS_I according to MUST_FOLLOW and
2232 MUST_PRECEDE. */
2233 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2234 {
2235 free (ps_i);
2236 return NULL;
2237 }
2238
2239 return ps_i;
2240 }
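
/* Example (illustrative): with issue_rate = 2, the first insn placed
   in a row carries row_rest_count = 1 and the second carries
   row_rest_count = 2; once the insn at the head of a row records
   row_rest_count >= issue_rate, the row is full and add_node_to_ps
   rejects further candidates for it. */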
2241
2242 /* Advance time one cycle. Assumes DFA is being used. */
2243 static void
2244 advance_one_cycle (void)
2245 {
2246 if (targetm.sched.dfa_pre_cycle_insn)
2247 state_transition (curr_state,
2248 targetm.sched.dfa_pre_cycle_insn ());
2249
2250 state_transition (curr_state, NULL);
2251
2252 if (targetm.sched.dfa_post_cycle_insn)
2253 state_transition (curr_state,
2254 targetm.sched.dfa_post_cycle_insn ());
2255 }
2256
2257 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), find
2258 the number of cycles the kernel takes according to the DFA; we
2259 use this to check whether SMS did well after register moves are
2260 added. In some cases the register-move overhead makes the result
2261 even worse than the original loop, so we want SMS to be applied
2262 only when it yields fewer cycles once register moves are added. */
2263 static int
2264 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2265 {
2266 int cycles = 0;
2267 rtx insn;
2268 int can_issue_more = issue_rate;
2269
2270 state_reset (curr_state);
2271
2272 for (insn = first_insn;
2273 insn != NULL_RTX && insn != last_insn;
2274 insn = NEXT_INSN (insn))
2275 {
2276 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2277 continue;
2278
2279 /* Check if there is room for the current insn. */
2280 if (!can_issue_more || state_dead_lock_p (curr_state))
2281 {
2282 cycles++;
2283 advance_one_cycle ();
2284 can_issue_more = issue_rate;
2285 }
2286
2287 /* Update the DFA state; if the DFA found resource conflicts,
2288 start a new cycle. */
2289 if (state_transition (curr_state, insn) >= 0)
2290 {
2291 cycles++;
2292 advance_one_cycle ();
2293 can_issue_more = issue_rate;
2294 }
2295
2296 if (targetm.sched.variable_issue)
2297 can_issue_more =
2298 targetm.sched.variable_issue (sched_dump, sched_verbose,
2299 insn, can_issue_more);
2300 /* A naked CLOBBER or USE generates no instruction, so don't
2301 let them consume issue slots. */
2302 else if (GET_CODE (PATTERN (insn)) != USE
2303 && GET_CODE (PATTERN (insn)) != CLOBBER)
2304 can_issue_more--;
2305 }
2306 return cycles;
2307 }
2308
2309 /* Checks if PS has resource conflicts according to the DFA, from cycle
2310 FROM to cycle TO; returns true if there are conflicts and false
2311 if there are no conflicts. Assumes DFA is being used. */
2312 static int
2313 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2314 {
2315 int cycle;
2316
2317 state_reset (curr_state);
2318
2319 for (cycle = from; cycle <= to; cycle++)
2320 {
2321 ps_insn_ptr crr_insn;
2322 /* Holds the remaining issue slots in the current row. */
2323 int can_issue_more = issue_rate;
2324
2325 /* Walk through the DFA for the current row. */
2326 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2327 crr_insn;
2328 crr_insn = crr_insn->next_in_row)
2329 {
2330 rtx insn = crr_insn->node->insn;
2331
2332 if (!INSN_P (insn))
2333 continue;
2334
2335 /* Check if there is room for the current insn. */
2336 if (!can_issue_more || state_dead_lock_p (curr_state))
2337 return true;
2338
2339 /* Update the DFA state and return with failure if the DFA found
2340 resource conflicts. */
2341 if (state_transition (curr_state, insn) >= 0)
2342 return true;
2343
2344 if (targetm.sched.variable_issue)
2345 can_issue_more =
2346 targetm.sched.variable_issue (sched_dump, sched_verbose,
2347 insn, can_issue_more);
2348 /* A naked CLOBBER or USE generates no instruction, so don't
2349 let them consume issue slots. */
2350 else if (GET_CODE (PATTERN (insn)) != USE
2351 && GET_CODE (PATTERN (insn)) != CLOBBER)
2352 can_issue_more--;
2353 }
2354
2355 /* Advance the DFA to the next cycle. */
2356 advance_one_cycle ();
2357 }
2358 return false;
2359 }
2360
2361 /* Checks if the given node causes resource conflicts when added to PS at
2362 cycle C. If not, the node is added to PS and returned; otherwise NULL
2363 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2364 cuid N must come before/after (respectively) the node pointed to by
2365 PS_I when scheduled in the same cycle. */
2366 ps_insn_ptr
2367 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2368 int c, sbitmap must_precede,
2369 sbitmap must_follow)
2370 {
2371 int has_conflicts = 0;
2372 ps_insn_ptr ps_i;
2373
2374 /* First add the node to the PS, if this succeeds check for
2375 conflicts, trying different issue slots in the same row. */
2376 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2377 return NULL; /* Failed to insert the node at the given cycle. */
2378
2379 has_conflicts = ps_has_conflicts (ps, c, c)
2380 || (ps->history > 0
2381 && ps_has_conflicts (ps,
2382 c - ps->history,
2383 c + ps->history));
2384
2385 /* Try different issue slots to find one that the given node can be
2386 scheduled in without conflicts. */
2387 while (has_conflicts)
2388 {
2389 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2390 break;
2391 has_conflicts = ps_has_conflicts (ps, c, c)
2392 || (ps->history > 0
2393 && ps_has_conflicts (ps,
2394 c - ps->history,
2395 c + ps->history));
2396 }
2397
2398 if (has_conflicts)
2399 {
2400 remove_node_from_ps (ps, ps_i);
2401 return NULL;
2402 }
2403
2404 ps->min_cycle = MIN (ps->min_cycle, c);
2405 ps->max_cycle = MAX (ps->max_cycle, c);
2406 return ps_i;
2407 }
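
/* Example (illustrative): with ps->history = 1, placing a node at
   cycle c re-runs the DFA over cycles c-1 .. c+1, so a multi-cycle
   resource reservation issued at c-1 can veto the placement even
   though cycle c alone appears conflict-free. */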
2408
2409 /* Rotate the rows of PS such that insns scheduled at time
2410 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2411 void
2412 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2413 {
2414 int i, row, backward_rotates;
2415 int last_row = ps->ii - 1;
2416
2417 if (start_cycle == 0)
2418 return;
2419
2420 backward_rotates = SMODULO (start_cycle, ps->ii);
2421
2422 /* Revisit later and optimize this into a single loop. */
2423 for (i = 0; i < backward_rotates; i++)
2424 {
2425 ps_insn_ptr first_row = ps->rows[0];
2426
2427 for (row = 0; row < last_row; row++)
2428 ps->rows[row] = ps->rows[row+1];
2429
2430 ps->rows[last_row] = first_row;
2431 }
2432
2433 ps->max_cycle -= start_cycle;
2434 ps->min_cycle -= start_cycle;
2435 }
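
/* Worked example (illustrative): with ii = 4 and start_cycle = 6,
   backward_rotates = SMODULO (6, 4) = 2, so rows [r0 r1 r2 r3]
   become [r2 r3 r0 r1]; the insns scheduled at cycle 6 (old row 2)
   now appear in row 0, as required. */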
2436
2437 /* Remove the node N from the partial schedule PS. Because we restart
2438 the DFA each time we want to check for resource conflicts, this is
2439 equivalent to unscheduling the node N. */
2440 static bool
2441 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2442 {
2443 ps_insn_ptr ps_i;
2444 int row = SMODULO (SCHED_TIME (n), ps->ii);
2445
2446 if (row < 0 || row >= ps->ii)
2447 return false;
2448
2449 for (ps_i = ps->rows[row];
2450 ps_i && ps_i->node != n;
2451 ps_i = ps_i->next_in_row);
2452 if (!ps_i)
2453 return false;
2454
2455 return remove_node_from_ps (ps, ps_i);
2456 }
2457 #endif /* INSN_SCHEDULING */
2458 \f
2459 static bool
2460 gate_handle_sms (void)
2461 {
2462 return (optimize > 0 && flag_modulo_sched);
2463 }
2464
2465
2466 /* Run the instruction scheduler. */
2467 /* Perform SMS modulo scheduling. */
2468 static unsigned int
2469 rest_of_handle_sms (void)
2470 {
2471 #ifdef INSN_SCHEDULING
2472 basic_block bb;
2473
2474 /* Collect loop information to be used in SMS. */
2475 cfg_layout_initialize (0);
2476 sms_schedule ();
2477
2478 /* We added pseudos, so update the maximum register number. */
2479 max_regno = max_reg_num ();
2480
2481 /* Finalize layout changes. */
2482 FOR_EACH_BB (bb)
2483 if (bb->next_bb != EXIT_BLOCK_PTR)
2484 bb->aux = bb->next_bb;
2485 free_dominance_info (CDI_DOMINATORS);
2486 cfg_layout_finalize ();
2487 #endif /* INSN_SCHEDULING */
2488 return 0;
2489 }
2490
2491 struct tree_opt_pass pass_sms =
2492 {
2493 "sms", /* name */
2494 gate_handle_sms, /* gate */
2495 rest_of_handle_sms, /* execute */
2496 NULL, /* sub */
2497 NULL, /* next */
2498 0, /* static_pass_number */
2499 TV_SMS, /* tv_id */
2500 0, /* properties_required */
2501 0, /* properties_provided */
2502 0, /* properties_destroyed */
2503 TODO_dump_func, /* todo_flags_start */
2504 TODO_df_finish |
2505 TODO_dump_func |
2506 TODO_ggc_collect, /* todo_flags_finish */
2507 'm' /* letter */
2508 };
2509