1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
22
23
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
50 #include "timevar.h"
51 #include "tree-pass.h"
52
53 #ifdef INSN_SCHEDULING
54
55 /* This file contains the implementation of the Swing Modulo Scheduler,
56 described in the following references:
57 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58 Lifetime-sensitive modulo scheduling in a production environment.
59 IEEE Trans. on Comps., 50(3), March 2001
60 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63
64 The basic structure is:
65 1. Build a data-dependence graph (DDG) for each loop.
66 2. Use the DDG to order the insns of a loop (not in topological order
67 necessarily, but rather) trying to place each insn after all its
68 predecessors _or_ after all its successors.
69 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70 4. Use the ordering to perform list-scheduling of the loop:
71 1. Set II = MII. We will try to schedule the loop within II cycles.
72 2. Try to schedule the insns one by one according to the ordering.
73 For each insn compute an interval of cycles by considering already-
74 scheduled preds and succs (and associated latencies); try to place
75 the insn in the cycles of this window checking for potential
76 resource conflicts (using the DFA interface).
77 Note: this is different from the cycle-scheduling of schedule_insns;
78 here the insns are not scheduled monotonically top-down (nor bottom-
79 up).
80 3. If we fail to schedule all insns, bump II and try again, unless
81 II reaches an upper bound MaxII, in which case report failure.
82 5. If we succeeded in scheduling the loop within II cycles, we now
83 generate prolog and epilog, decrease the counter of the loop, and
84 perform modulo variable expansion for live ranges that span more than
85 II cycles (i.e. use register copies to prevent a def from overwriting
86 itself before reaching the use).
87 */
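/* For illustration only (not part of this pass): a minimal sketch, under
   assumed names, of the II search in step 4 above.  try_schedule_all_insns
   is a hypothetical stand-in for the real sms_schedule_by_order; it returns
   nonzero iff every insn was placed within II cycles.  */
extern int try_schedule_all_insns (int ii);

static int
find_smallest_ii (int mii, int maxii)
{
  int ii;

  for (ii = mii; ii < maxii; ii++)
    if (try_schedule_all_insns (ii))
      return ii;	/* Succeeded in scheduling within II cycles.  */
  return -1;		/* II reached the upper bound MaxII: report failure.  */
}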
88
89 \f
90 /* This page defines partial-schedule structures and functions for
91 modulo scheduling. */
92
93 typedef struct partial_schedule *partial_schedule_ptr;
94 typedef struct ps_insn *ps_insn_ptr;
95
96 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
97 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
98
99 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
100 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
101
102 /* Perform signed modulo, always returning a non-negative value. */
103 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
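/* For example, SMODULO (-1, 4) yields 3, whereas plain (-1) % 4 yields -1
   under C99's truncate-towards-zero division.  */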
104
105 /* The number of different iterations the nodes in ps span, assuming
106 the stage boundaries are placed efficiently. */
107 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
108 + 1 + (ps)->ii - 1) / (ps)->ii)
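/* For example, with min_cycle = 2, max_cycle = 9 and ii = 4, the nodes span
   8 cycles and PS_STAGE_COUNT is (8 + 4 - 1) / 4 = 2 stages (illustrative
   numbers).  */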
109
110 /* A single instruction in the partial schedule. */
111 struct ps_insn
112 {
113 /* The corresponding DDG_NODE. */
114 ddg_node_ptr node;
115
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
118 int cycle;
119
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row,
122 prev_in_row;
123
124 /* The number of nodes in the same row that come after this node. */
125 int row_rest_count;
126 };
127
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
132 {
133 int ii; /* Number of rows in the partial schedule. */
134 int history; /* Threshold for conflict checking using DFA. */
135
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
137 ps_insn_ptr *rows;
138
139 /* The earliest absolute cycle of an insn in the partial schedule. */
140 int min_cycle;
141
142 /* The latest absolute cycle of an insn in the partial schedule. */
143 int max_cycle;
144
145 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
146 };
147
148 /* We use this to record all the register replacements we do in
149 the kernel so we can undo SMS if it is not profitable. */
150 struct undo_replace_buff_elem
151 {
152 rtx insn;
153 rtx orig_reg;
154 rtx new_reg;
155 struct undo_replace_buff_elem *next;
156 };
157
158
159
160 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
161 static void free_partial_schedule (partial_schedule_ptr);
162 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
163 void print_partial_schedule (partial_schedule_ptr, FILE *);
164 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
165 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
166 ddg_node_ptr node, int cycle,
167 sbitmap must_precede,
168 sbitmap must_follow);
169 static void rotate_partial_schedule (partial_schedule_ptr, int);
170 void set_row_column_for_ps (partial_schedule_ptr);
171 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172
173 \f
174 /* This page defines constants and structures for the modulo scheduling
175 driver. */
176
177 /* As in haifa-sched.c: */
178 /* issue_rate is the number of insns that can be scheduled in the same
179 machine cycle. It can be defined in the config/mach/mach.h file,
180 otherwise we set it to 1. */
181
182 static int issue_rate;
183
184 static int sms_order_nodes (ddg_ptr, int, int * result);
185 static void set_node_sched_params (ddg_ptr);
186 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
187 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
188 static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
189 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
190 int from_stage, int to_stage,
191 int is_prolog);
192
193 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
194 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
195 #define SCHED_FIRST_REG_MOVE(x) \
196 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
197 #define SCHED_NREG_MOVES(x) \
198 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
199 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
200 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
201 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
202
203 /* The scheduling parameters held for each node. */
204 typedef struct node_sched_params
205 {
206 int asap; /* A lower-bound on the absolute scheduling cycle. */
207 int time; /* The absolute scheduling cycle (time >= asap). */
208
209 /* The following field (first_reg_move) is a pointer to the first
210 register-move instruction added to handle the modulo-variable-expansion
211 of the register defined by this node. This register-move copies the
212 original register defined by the node. */
213 rtx first_reg_move;
214
215 /* The number of register-move instructions added, immediately preceding
216 first_reg_move. */
217 int nreg_moves;
218
219 int row; /* Holds time % ii. */
220 int stage; /* Holds time / ii. */
221
222 /* The column of a node inside the ps. If nodes u, v are on the same row,
223 u will precede v if column (u) < column (v). */
224 int column;
225 } *node_sched_params_ptr;
226
227 \f
228 /* The following functions are copied from the current scheduler
229 code in order to use sched_analyze() for computing the dependencies.
230 They are used when initializing the sched_info structure. */
231 static const char *
232 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233 {
234 static char tmp[80];
235
236 sprintf (tmp, "i%4d", INSN_UID (insn));
237 return tmp;
238 }
239
240 static void
241 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
242 regset cond_exec ATTRIBUTE_UNUSED,
243 regset used ATTRIBUTE_UNUSED,
244 regset set ATTRIBUTE_UNUSED)
245 {
246 }
247
248 static struct sched_info sms_sched_info =
249 {
250 NULL,
251 NULL,
252 NULL,
253 NULL,
254 NULL,
255 sms_print_insn,
256 NULL,
257 compute_jump_reg_dependencies,
258 NULL, NULL,
259 NULL, NULL,
260 0, 0, 0,
261
262 NULL, NULL, NULL, NULL, NULL,
263 #ifdef ENABLE_CHECKING
264 NULL,
265 #endif
266 0
267 };
268
269
270 /* Return the register decremented and tested in INSN,
271 or zero if it is not a decrement-and-branch insn. */
272
273 static rtx
274 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
275 {
276 #ifdef HAVE_doloop_end
277 rtx pattern, reg, condition;
278
279 if (! JUMP_P (insn))
280 return NULL_RTX;
281
282 pattern = PATTERN (insn);
283 condition = doloop_condition_get (pattern);
284 if (! condition)
285 return NULL_RTX;
286
287 if (REG_P (XEXP (condition, 0)))
288 reg = XEXP (condition, 0);
289 else if (GET_CODE (XEXP (condition, 0)) == PLUS
290 && REG_P (XEXP (XEXP (condition, 0), 0)))
291 reg = XEXP (XEXP (condition, 0), 0);
292 else
293 gcc_unreachable ();
294
295 return reg;
296 #else
297 return NULL_RTX;
298 #endif
299 }
300
301 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
302 that the number of iterations is a compile-time constant. If so,
303 return the rtx that sets COUNT_REG to a constant, and set COUNT to
304 this constant. Otherwise return 0. */
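/* For example, a pre-header insn whose pattern is
   (set (reg:SI 100) (const_int 64)) would make the iteration count 64 a
   compile-time constant (register number illustrative).  */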
305 static rtx
306 const_iteration_count (rtx count_reg, basic_block pre_header,
307 HOST_WIDEST_INT * count)
308 {
309 rtx insn;
310 rtx head, tail;
311
312 if (! pre_header)
313 return NULL_RTX;
314
315 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
316
317 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
318 if (INSN_P (insn) && single_set (insn) &&
319 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
320 {
321 rtx pat = single_set (insn);
322
323 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
324 {
325 *count = INTVAL (SET_SRC (pat));
326 return insn;
327 }
328
329 return NULL_RTX;
330 }
331
332 return NULL_RTX;
333 }
334
335 /* A very simple resource-based lower bound on the initiation interval.
336 ??? Improve the accuracy of this bound by considering the
337 utilization of various units. */
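/* For example, a loop body of 8 insns on a 2-issue machine cannot initiate
   iterations more often than every 8 / 2 = 4 cycles, so res_MII is 4
   (illustrative numbers).  */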
338 static int
339 res_MII (ddg_ptr g)
340 {
341 return (g->num_nodes / issue_rate);
342 }
343
344
345 /* Points to the array that contains the sched data for each node. */
346 static node_sched_params_ptr node_sched_params;
347
348 /* Allocate sched_params for each node and initialize it. Assumes that
349 the aux field of each node contains the asap bound (computed earlier),
350 and copies it into the sched_params field. */
351 static void
352 set_node_sched_params (ddg_ptr g)
353 {
354 int i;
355
356 /* Allocate for each node in the DDG a place to hold the "sched_data". */
357 /* Initialize ASAP/ALAP/HEIGHT to zero. */
358 node_sched_params = (node_sched_params_ptr)
359 xcalloc (g->num_nodes,
360 sizeof (struct node_sched_params));
361
362 /* Set the pointer of the general data of the node to point to the
363 appropriate sched_params structure. */
364 for (i = 0; i < g->num_nodes; i++)
365 {
366 /* Watch out for aliasing problems? */
367 node_sched_params[i].asap = g->nodes[i].aux.count;
368 g->nodes[i].aux.info = &node_sched_params[i];
369 }
370 }
371
372 static void
373 print_node_sched_params (FILE * file, int num_nodes)
374 {
375 int i;
376
377 if (! file)
378 return;
379 for (i = 0; i < num_nodes; i++)
380 {
381 node_sched_params_ptr nsp = &node_sched_params[i];
382 rtx reg_move = nsp->first_reg_move;
383 int j;
384
385 fprintf (file, "Node %d:\n", i);
386 fprintf (file, " asap = %d:\n", nsp->asap);
387 fprintf (file, " time = %d:\n", nsp->time);
388 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
389 for (j = 0; j < nsp->nreg_moves; j++)
390 {
391 fprintf (file, " reg_move = ");
392 print_rtl_single (file, reg_move);
393 reg_move = PREV_INSN (reg_move);
394 }
395 }
396 }
397
398 /* Calculate an upper bound for II. SMS should not schedule the loop if it
399 requires more cycles than this bound. Currently set to the sum of the
400 longest latency edge for each node. Reset based on experiments. */
401 static int
402 calculate_maxii (ddg_ptr g)
403 {
404 int i;
405 int maxii = 0;
406
407 for (i = 0; i < g->num_nodes; i++)
408 {
409 ddg_node_ptr u = &g->nodes[i];
410 ddg_edge_ptr e;
411 int max_edge_latency = 0;
412
413 for (e = u->out; e; e = e->next_out)
414 max_edge_latency = MAX (max_edge_latency, e->latency);
415
416 maxii += max_edge_latency;
417 }
418 return maxii;
419 }
420
421 /*
422 Breaking intra-loop register anti-dependences:
423 Each intra-loop register anti-dependence implies a cross-iteration true
424 dependence of distance 1. Therefore, we can remove such false dependencies
425 and figure out if the partial schedule broke them by checking if (for a
426 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
427 if so generate a register move. The number of such moves is equal to:
428 nreg_moves = (SCHED_TIME (use) - SCHED_TIME (def)) / ii + 1 - B,
429 where B is 0 if the schedule broke the dependence and 1 if it did
430 not.
431 */
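/* For example, with ii = 4, SCHED_TIME (def) = 1 and SCHED_TIME (use) = 6
   on a broken distance-1 dependence, nreg_moves = (6 - 1) / 4 + 1 = 2
   register copies (illustrative numbers).  */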
432 static struct undo_replace_buff_elem *
433 generate_reg_moves (partial_schedule_ptr ps)
434 {
435 ddg_ptr g = ps->g;
436 int ii = ps->ii;
437 int i;
438 struct undo_replace_buff_elem *reg_move_replaces = NULL;
439
440 for (i = 0; i < g->num_nodes; i++)
441 {
442 ddg_node_ptr u = &g->nodes[i];
443 ddg_edge_ptr e;
444 int nreg_moves = 0, i_reg_move;
445 sbitmap *uses_of_defs;
446 rtx last_reg_move;
447 rtx prev_reg, old_reg;
448
449 /* Compute the number of reg_moves needed for u, by looking at life
450 ranges started at u (excluding self-loops). */
451 for (e = u->out; e; e = e->next_out)
452 if (e->type == TRUE_DEP && e->dest != e->src)
453 {
454 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
455
456 if (e->distance == 1)
457 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
458
459 /* If dest precedes src in the schedule of the kernel, then dest
460 will read before src writes and we can save one reg_copy. */
461 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
462 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
463 nreg_moves4e--;
464
465 nreg_moves = MAX (nreg_moves, nreg_moves4e);
466 }
467
468 if (nreg_moves == 0)
469 continue;
470
471 /* Every use of the register defined by node may require a different
472 copy of this register, depending on the time the use is scheduled.
473 Set a bitmap vector, telling which nodes use each copy of this
474 register. */
475 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
476 sbitmap_vector_zero (uses_of_defs, nreg_moves);
477 for (e = u->out; e; e = e->next_out)
478 if (e->type == TRUE_DEP && e->dest != e->src)
479 {
480 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
481
482 if (e->distance == 1)
483 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
484
485 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
486 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
487 dest_copy--;
488
489 if (dest_copy)
490 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
491 }
492
493 /* Now generate the reg_moves, attaching relevant uses to them. */
494 SCHED_NREG_MOVES (u) = nreg_moves;
495 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
496 last_reg_move = u->insn;
497
498 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
499 {
500 unsigned int i_use = 0;
501 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
502 rtx reg_move = gen_move_insn (new_reg, prev_reg);
503 sbitmap_iterator sbi;
504
505 add_insn_before (reg_move, last_reg_move);
506 last_reg_move = reg_move;
507
508 if (!SCHED_FIRST_REG_MOVE (u))
509 SCHED_FIRST_REG_MOVE (u) = reg_move;
510
511 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
512 {
513 struct undo_replace_buff_elem *rep;
514
515 rep = (struct undo_replace_buff_elem *)
516 xcalloc (1, sizeof (struct undo_replace_buff_elem));
517 rep->insn = g->nodes[i_use].insn;
518 rep->orig_reg = old_reg;
519 rep->new_reg = new_reg;
520
521 if (! reg_move_replaces)
522 reg_move_replaces = rep;
523 else
524 {
525 rep->next = reg_move_replaces;
526 reg_move_replaces = rep;
527 }
528
529 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
530 }
531
532 prev_reg = new_reg;
533 }
534 sbitmap_vector_free (uses_of_defs);
535 }
536 return reg_move_replaces;
537 }
538
539 /* We call this when we want to undo the SMS schedule for a given loop.
540 One of the things that we do is to delete the register moves generated
541 for the sake of SMS; this function deletes the register move instructions
542 recorded in the undo buffer. */
543 static void
544 undo_generate_reg_moves (partial_schedule_ptr ps,
545 struct undo_replace_buff_elem *reg_move_replaces)
546 {
547 int i,j;
548
549 for (i = 0; i < ps->g->num_nodes; i++)
550 {
551 ddg_node_ptr u = &ps->g->nodes[i];
552 rtx prev;
553 rtx crr = SCHED_FIRST_REG_MOVE (u);
554
555 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
556 {
557 prev = PREV_INSN (crr);
558 delete_insn (crr);
559 crr = prev;
560 }
561 SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
562 }
563
564 while (reg_move_replaces)
565 {
566 struct undo_replace_buff_elem *rep = reg_move_replaces;
567
568 reg_move_replaces = reg_move_replaces->next;
569 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
570 }
571 }
572
573 /* Free memory allocated for the undo buffer. */
574 static void
575 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
576 {
577
578 while (reg_move_replaces)
579 {
580 struct undo_replace_buff_elem *rep = reg_move_replaces;
581
582 reg_move_replaces = reg_move_replaces->next;
583 free (rep);
584 }
585 }
586
587 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
588 of SCHED_ROW and SCHED_STAGE. */
589 static void
590 normalize_sched_times (partial_schedule_ptr ps)
591 {
592 int i;
593 ddg_ptr g = ps->g;
594 int amount = PS_MIN_CYCLE (ps);
595 int ii = ps->ii;
596
597 /* Don't include the closing branch assuming that it is the last node. */
598 for (i = 0; i < g->num_nodes - 1; i++)
599 {
600 ddg_node_ptr u = &g->nodes[i];
601 int normalized_time = SCHED_TIME (u) - amount;
602
603 gcc_assert (normalized_time >= 0);
604
605 SCHED_TIME (u) = normalized_time;
606 SCHED_ROW (u) = normalized_time % ii;
607 SCHED_STAGE (u) = normalized_time / ii;
608 }
609 }
610
611 /* Set SCHED_COLUMN of each node according to its position in PS. */
612 static void
613 set_columns_for_ps (partial_schedule_ptr ps)
614 {
615 int row;
616
617 for (row = 0; row < ps->ii; row++)
618 {
619 ps_insn_ptr cur_insn = ps->rows[row];
620 int column = 0;
621
622 for (; cur_insn; cur_insn = cur_insn->next_in_row)
623 SCHED_COLUMN (cur_insn->node) = column++;
624 }
625 }
626
627 /* Permute the insns according to their order in PS, from row 0 to
628 row ii-1, and position them right before LAST. This schedules
629 the insns of the loop kernel. */
630 static void
631 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
632 {
633 int ii = ps->ii;
634 int row;
635 ps_insn_ptr ps_ij;
636
637 for (row = 0; row < ii ; row++)
638 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
639 if (PREV_INSN (last) != ps_ij->node->insn)
640 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
641 PREV_INSN (last));
642 }
643
644 /* As part of undoing SMS we return to the original ordering of the
645 instructions inside the loop kernel. Given the partial schedule PS, this
646 function restores the ordering of the instructions according to their
647 CUIDs in the DDG (PS->G), which is the original order of the
648 instructions before performing SMS. */
649 static void
650 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
651 {
652 int i;
653
654 for (i = 0 ; i < ps->g->num_nodes; i++)
655 if (last == ps->g->nodes[i].insn
656 || last == ps->g->nodes[i].first_note)
657 break;
658 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
659 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
660 PREV_INSN (last));
661 }
662
663 /* Used to generate the prologue & epilogue. Duplicate the subset of
664 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
665 of both), together with a prefix/suffix of their reg_moves. */
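/* For example, with a stage count of 3 (stages 0, 1, 2), the prolog
   duplicates stages {0} and then {0,1}, and the epilog duplicates stages
   {1,2} and then {2}; see generate_prolog_epilog below (illustrative
   numbers).  */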
666 static void
667 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
668 int to_stage, int for_prolog)
669 {
670 int row;
671 ps_insn_ptr ps_ij;
672
673 for (row = 0; row < ps->ii; row++)
674 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
675 {
676 ddg_node_ptr u_node = ps_ij->node;
677 int j, i_reg_moves;
678 rtx reg_move = NULL_RTX;
679
680 if (for_prolog)
681 {
682 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
683 number of reg_moves starting with the second occurrence of
684 u_node, which is generated if its SCHED_STAGE <= to_stage. */
685 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
686 i_reg_moves = MAX (i_reg_moves, 0);
687 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
688
689 /* The reg_moves start from the *first* reg_move backwards. */
690 if (i_reg_moves)
691 {
692 reg_move = SCHED_FIRST_REG_MOVE (u_node);
693 for (j = 1; j < i_reg_moves; j++)
694 reg_move = PREV_INSN (reg_move);
695 }
696 }
697 else /* It's for the epilog. */
698 {
699 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
700 starting to decrease one stage after u_node no longer occurs;
701 that is, generate all reg_moves until
702 SCHED_STAGE (u_node) == from_stage - 1. */
703 i_reg_moves = SCHED_NREG_MOVES (u_node)
704 - (from_stage - SCHED_STAGE (u_node) - 1);
705 i_reg_moves = MAX (i_reg_moves, 0);
706 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
707
708 /* The reg_moves start from the *last* reg_move forwards. */
709 if (i_reg_moves)
710 {
711 reg_move = SCHED_FIRST_REG_MOVE (u_node);
712 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
713 reg_move = PREV_INSN (reg_move);
714 }
715 }
716
717 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
718 emit_insn (copy_rtx (PATTERN (reg_move)));
719 if (SCHED_STAGE (u_node) >= from_stage
720 && SCHED_STAGE (u_node) <= to_stage)
721 duplicate_insn_chain (u_node->first_note, u_node->insn);
722 }
723 }
724
725
726 /* Generate the instructions (including reg_moves) for prolog & epilog. */
727 static void
728 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
729 {
730 int i;
731 int last_stage = PS_STAGE_COUNT (ps) - 1;
732 edge e;
733
734 /* Generate the prolog, inserting its insns on the loop-entry edge. */
735 start_sequence ();
736
737 if (count_reg)
738 /* Generate a subtract instruction at the beginning of the prolog to
739 adjust the loop count by STAGE_COUNT. */
740 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
741
742 for (i = 0; i < last_stage; i++)
743 duplicate_insns_of_cycles (ps, 0, i, 1);
744
745 /* Put the prolog on the entry edge. */
746 e = loop_preheader_edge (loop);
747 split_edge_and_insert (e, get_insns());
748
749 end_sequence ();
750
751 /* Generate the epilog, inserting its insns on the loop-exit edge. */
752 start_sequence ();
753
754 for (i = 0; i < last_stage; i++)
755 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
756
757 /* Put the epilogue on the exit edge. */
758 gcc_assert (loop->single_exit);
759 e = loop->single_exit;
760 split_edge_and_insert (e, get_insns());
761 end_sequence ();
762 }
763
764 /* Return true if all the BBs of the loop are empty except the
765 loop header. */
766 static bool
767 loop_single_full_bb_p (struct loop *loop)
768 {
769 unsigned i;
770 basic_block *bbs = get_loop_body (loop);
771
772 for (i = 0; i < loop->num_nodes ; i++)
773 {
774 rtx head, tail;
775 bool empty_bb = true;
776
777 if (bbs[i] == loop->header)
778 continue;
779
780 /* Make sure that basic blocks other than the header
781 have only notes, labels or jumps. */
782 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
783 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
784 {
785 if (NOTE_P (head) || LABEL_P (head)
786 || (INSN_P (head) && JUMP_P (head)))
787 continue;
788 empty_bb = false;
789 break;
790 }
791
792 if (! empty_bb)
793 {
794 free (bbs);
795 return false;
796 }
797 }
798 free (bbs);
799 return true;
800 }
801
802 /* A simple loop from SMS point of view; it is a loop that is composed of
803 either a single basic block or two BBs - a header and a latch. */
804 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
805 && (EDGE_COUNT (loop->latch->preds) == 1) \
806 && (EDGE_COUNT (loop->latch->succs) == 1))
807
808 /* Return true if the loop is in its canonical form and false if not,
809 i.e. it satisfies SIMPLE_SMS_LOOP_P and has one preheader block and a single exit. */
810 static bool
811 loop_canon_p (struct loop *loop)
812 {
813
814 if (loop->inner || ! loop->outer)
815 return false;
816
817 if (!loop->single_exit)
818 {
819 if (dump_file)
820 {
821 fprintf (dump_file, "SMS loop many exits ");
822 }
823 return false;
824 }
825
826 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
827 {
828 if (dump_file)
829 {
830 fprintf (dump_file, "SMS loop many BBs. ");
831 }
832 return false;
833 }
834
835 return true;
836 }
837
838 /* If there is more than one entry for the loop,
839 make it one by splitting the first entry edge and
840 redirecting the others to the new BB. */
841 static void
842 canon_loop (struct loop *loop)
843 {
844 edge e;
845 edge_iterator i;
846
847 /* Avoid annoying special cases of edges going to exit
848 block. */
849 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
850 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
851 split_edge (e);
852
853 if (loop->latch == loop->header
854 || EDGE_COUNT (loop->latch->succs) > 1)
855 {
856 FOR_EACH_EDGE (e, i, loop->header->preds)
857 if (e->src == loop->latch)
858 break;
859 split_edge (e);
860 }
861 }
862
863 /* Main entry point, perform SMS scheduling on the loops of the function
864 that consist of single basic blocks. */
865 static void
866 sms_schedule (void)
867 {
868 static int passes = 0;
869 rtx insn;
870 ddg_ptr *g_arr, g;
871 int * node_order;
872 int maxii;
873 unsigned i,num_loops;
874 partial_schedule_ptr ps;
875 struct df *df;
876 basic_block bb = NULL;
877 /* Variables used for loop versioning, only if needed. */
878 struct loop * nloop;
879 basic_block condition_bb = NULL;
880 edge latch_edge;
881 gcov_type trip_count = 0;
882
883 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
884 | LOOPS_HAVE_MARKED_SINGLE_EXITS);
885 if (!current_loops)
886 return; /* There are no loops to schedule. */
887
888 /* Initialize issue_rate. */
889 if (targetm.sched.issue_rate)
890 {
891 int temp = reload_completed;
892
893 reload_completed = 1;
894 issue_rate = targetm.sched.issue_rate ();
895 reload_completed = temp;
896 }
897 else
898 issue_rate = 1;
899
900 /* Initialize the scheduler. */
901 current_sched_info = &sms_sched_info;
902 sched_init ();
903
904 /* Init Data Flow analysis, to be used in interloop dep calculation. */
905 df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
906 df_rd_add_problem (df, 0);
907 df_ru_add_problem (df, 0);
908 df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
909 df_analyze (df);
910
911 if (dump_file)
912 df_dump (df, dump_file);
913
914 /* Allocate memory to hold the DDG array, one entry for each loop.
915 We use loop->num as index into this array. */
916 g_arr = XCNEWVEC (ddg_ptr, current_loops->num);
917
918
919 /* Build DDGs for all the relevant loops and hold them in G_ARR
920 indexed by the loop index. */
921 for (i = 0; i < current_loops->num; i++)
922 {
923 rtx head, tail;
924 rtx count_reg;
925 struct loop *loop = current_loops->parray[i];
926
927 /* For debugging. */
928 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
929 {
930 if (dump_file)
931 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
932
933 break;
934 }
935
936 if (! loop_canon_p (loop))
937 continue;
938
939 if (! loop_single_full_bb_p (loop))
940 continue;
941
942 bb = loop->header;
943
944 get_ebb_head_tail (bb, bb, &head, &tail);
945 latch_edge = loop_latch_edge (loop);
946 gcc_assert (loop->single_exit);
947 if (loop->single_exit->count)
948 trip_count = latch_edge->count / loop->single_exit->count;
949
950 /* Perform SMS only on loops whose average count is above the threshold. */
951
952 if ( latch_edge->count
953 && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
954 {
955 if (dump_file)
956 {
957 fprintf (dump_file, "SMS single-bb-loop\n");
958 if (profile_info && flag_branch_probabilities)
959 {
960 fprintf (dump_file, "SMS loop-count ");
961 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
962 (HOST_WIDEST_INT) bb->count);
963 fprintf (dump_file, "\n");
964 fprintf (dump_file, "SMS trip-count ");
965 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
966 (HOST_WIDEST_INT) trip_count);
967 fprintf (dump_file, "\n");
968 fprintf (dump_file, "SMS profile-sum-max ");
969 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
970 (HOST_WIDEST_INT) profile_info->sum_max);
971 fprintf (dump_file, "\n");
972 }
973 }
974 continue;
975 }
976
977 /* Make sure this is a doloop. */
978 if ( !(count_reg = doloop_register_get (tail)))
979 continue;
980
981 /* Don't handle BBs with calls or barriers, or !single_set insns. */
982 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
983 if (CALL_P (insn)
984 || BARRIER_P (insn)
985 || (INSN_P (insn) && !JUMP_P (insn)
986 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
987 break;
988
989 if (insn != NEXT_INSN (tail))
990 {
991 if (dump_file)
992 {
993 if (CALL_P (insn))
994 fprintf (dump_file, "SMS loop-with-call\n");
995 else if (BARRIER_P (insn))
996 fprintf (dump_file, "SMS loop-with-barrier\n");
997 else
998 fprintf (dump_file, "SMS loop-with-not-single-set\n");
999 print_rtl_single (dump_file, insn);
1000 }
1001
1002 continue;
1003 }
1004
1005 if (! (g = create_ddg (bb, df, 0)))
1006 {
1007 if (dump_file)
1008 fprintf (dump_file, "SMS doloop\n");
1009 continue;
1010 }
1011
1012 g_arr[i] = g;
1013 }
1014
1015 /* Release Data Flow analysis data structures. */
1016 df_finish (df);
1017 df = NULL;
1018
1019 /* We don't want to perform SMS on new loops - created by versioning. */
1020 num_loops = current_loops->num;
1021 /* Go over the built DDGs and perform SMS for each one of them. */
1022 for (i = 0; i < num_loops; i++)
1023 {
1024 rtx head, tail;
1025 rtx count_reg, count_init;
1026 int mii, rec_mii;
1027 unsigned stage_count = 0;
1028 HOST_WIDEST_INT loop_count = 0;
1029 struct loop *loop = current_loops->parray[i];
1030
1031 if (! (g = g_arr[i]))
1032 continue;
1033
1034 if (dump_file)
1035 print_ddg (dump_file, g);
1036
1037 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1038
1039 latch_edge = loop_latch_edge (loop);
1040 gcc_assert (loop->single_exit);
1041 if (loop->single_exit->count)
1042 trip_count = latch_edge->count / loop->single_exit->count;
1043
1044 if (dump_file)
1045 {
1046 fprintf (dump_file, "SMS single-bb-loop\n");
1047 if (profile_info && flag_branch_probabilities)
1048 {
1049 fprintf (dump_file, "SMS loop-count ");
1050 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1051 (HOST_WIDEST_INT) bb->count);
1052 fprintf (dump_file, "\n");
1053 fprintf (dump_file, "SMS profile-sum-max ");
1054 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1055 (HOST_WIDEST_INT) profile_info->sum_max);
1056 fprintf (dump_file, "\n");
1057 }
1058 fprintf (dump_file, "SMS doloop\n");
1059 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1060 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1061 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1062 }
1063
1064
1065 /* If the loop has a doloop register, it gets special
1066 handling. */
1067 count_init = NULL_RTX;
1068 if ((count_reg = doloop_register_get (tail)))
1069 {
1070 basic_block pre_header;
1071
1072 pre_header = loop_preheader_edge (loop)->src;
1073 count_init = const_iteration_count (count_reg, pre_header,
1074 &loop_count);
1075 }
1076 gcc_assert (count_reg);
1077
1078 if (dump_file && count_init)
1079 {
1080 fprintf (dump_file, "SMS const-doloop ");
1081 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1082 loop_count);
1083 fprintf (dump_file, "\n");
1084 }
1085
1086 node_order = XNEWVEC (int, g->num_nodes);
1087
1088 mii = 1; /* Need to pass some estimate of mii. */
1089 rec_mii = sms_order_nodes (g, mii, node_order);
1090 mii = MAX (res_MII (g), rec_mii);
1091 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1092
1093 if (dump_file)
1094 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1095 rec_mii, mii, maxii);
1096
1097 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1098 ASAP. */
1099 set_node_sched_params (g);
1100
1101 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1102
1103 if (ps)
1104 stage_count = PS_STAGE_COUNT (ps);
1105
1106 /* Stage count of 1 means that there is no interleaving between
1107 iterations, so let the scheduling passes do the job. */
1108 if (stage_count < 1
1109 || (count_init && (loop_count <= stage_count))
1110 || (flag_branch_probabilities && (trip_count <= stage_count)))
1111 {
1112 if (dump_file)
1113 {
1114 fprintf (dump_file, "SMS failed... \n");
1115 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1116 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1117 fprintf (dump_file, ", trip-count=");
1118 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1119 fprintf (dump_file, ")\n");
1120 }
1121 continue;
1122 }
1123 else
1124 {
1125 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1126 int new_cycles;
1127 struct undo_replace_buff_elem *reg_move_replaces;
1128
1129 if (dump_file)
1130 {
1131 fprintf (dump_file,
1132 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1133 stage_count);
1134 print_partial_schedule (ps, dump_file);
1135 fprintf (dump_file,
1136 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1137 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1138 }
1139
1140 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1141 the closing_branch was scheduled and should appear in the last (ii-1)
1142 row. Otherwise, we are free to schedule the branch, and we let nodes
1143 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1144 row; this should reduce stage_count to minimum. */
1145 normalize_sched_times (ps);
1146 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1147 set_columns_for_ps (ps);
1148
1149 /* Generate the kernel just to be able to measure its cycles. */
1150 permute_partial_schedule (ps, g->closing_branch->first_note);
1151 reg_move_replaces = generate_reg_moves (ps);
1152
1153 /* Get the number of cycles the new kernel expects to execute in. */
1154 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1155
1156 /* Get back to the original loop so we can do loop versioning. */
1157 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1158 if (reg_move_replaces)
1159 undo_generate_reg_moves (ps, reg_move_replaces);
1160
1161 if ( new_cycles >= orig_cycles)
1162 {
1163 /* SMS is not profitable so undo the permutation and reg move generation
1164 and return the kernel to its original state. */
1165 if (dump_file)
1166 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1167
1168 }
1169 else
1170 {
1171 canon_loop (loop);
1172
1173 /* In case the BCT count is not known, do loop-versioning. */
1174 if (count_reg && ! count_init)
1175 {
1176 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1177 GEN_INT(stage_count));
1178
1179 nloop = loop_version (current_loops, loop, comp_rtx,
1180 &condition_bb, true);
1181 }
1182
1183 /* Set new iteration count of loop kernel. */
1184 if (count_reg && count_init)
1185 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1186 - stage_count + 1);
1187
1188 /* Now apply the scheduled kernel to the RTL of the loop. */
1189 permute_partial_schedule (ps, g->closing_branch->first_note);
1190
1191 /* Mark this loop as software pipelined so the later
1192 scheduling passes don't touch it. */
1193 if (! flag_resched_modulo_sched)
1194 g->bb->flags |= BB_DISABLE_SCHEDULE;
1195 /* The life-info is not valid any more. */
1196 g->bb->flags |= BB_DIRTY;
1197
1198 reg_move_replaces = generate_reg_moves (ps);
1199 if (dump_file)
1200 print_node_sched_params (dump_file, g->num_nodes);
1201 /* Generate prolog and epilog. */
1202 if (count_reg && !count_init)
1203 generate_prolog_epilog (ps, loop, count_reg);
1204 else
1205 generate_prolog_epilog (ps, loop, NULL_RTX);
1206 }
1207 free_undo_replace_buff (reg_move_replaces);
1208 }
1209
1210 free_partial_schedule (ps);
1211 free (node_sched_params);
1212 free (node_order);
1213 free_ddg (g);
1214 }
1215
1216 free (g_arr);
1217
1218 /* Release scheduler data, needed until now because of DFA. */
1219 sched_finish ();
1220 loop_optimizer_finalize ();
1221 }
1222
1223 /* The SMS scheduling algorithm itself
1224 -----------------------------------
1225 Input: 'O' an ordered list of insns of a loop.
1226 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1227
1228 'Q' is the empty Set
1229 'PS' is the partial schedule; it holds the currently scheduled nodes with
1230 their cycle/slot.
1231 'PSP' previously scheduled predecessors.
1232 'PSS' previously scheduled successors.
1233 't(u)' the cycle where u is scheduled.
1234 'l(u)' is the latency of u.
1235 'd(v,u)' is the dependence distance from v to u.
1236 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1237 the node ordering phase.
1238 'check_hardware_resources_conflicts(u, PS, c)'
1239 run a trace around cycle/slot through DFA model
1240 to check resource conflicts involving instruction u
1241 at cycle c given the partial schedule PS.
1242 'add_to_partial_schedule_at_time(u, PS, c)'
1243 Add the node/instruction u to the partial schedule
1244 PS at time c.
1245 'calculate_register_pressure(PS)'
1246 Given a schedule of instructions, calculate the register
1247 pressure it implies. One implementation could be the
1248 maximum number of overlapping live ranges.
1249 'maxRP' The maximum allowed register pressure, usually derived from the
1250 number of registers available in the hardware.
1251
1252 1. II = MII.
1253 2. PS = empty list
1254 3. for each node u in O in pre-computed order
1255 4. if (PSP(u) != Q && PSS(u) == Q) then
1256 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1257 6. start = Early_start; end = Early_start + II - 1; step = 1
1258 11. else if (PSP(u) == Q && PSS(u) != Q) then
1259 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1260 13. start = Late_start; end = Late_start - II + 1; step = -1
1261 14. else if (PSP(u) != Q && PSS(u) != Q) then
1262 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1263 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1264 17. start = Early_start;
1265 18. end = min(Early_start + II - 1 , Late_start);
1266 19. step = 1
1267 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1268 21. start = ASAP(u); end = start + II - 1; step = 1
1269 22. endif
1270
1271 23. success = false
1272 24. for (c = start ; c != end ; c += step)
1273 25. if check_hardware_resources_conflicts(u, PS, c) then
1274 26. add_to_partial_schedule_at_time(u, PS, c)
1275 27. success = true
1276 28. break
1277 29. endif
1278 30. endfor
1279 31. if (success == false) then
1280 32. II = II + 1
1281 33. if (II > maxII) then
1282 34. finish - failed to schedule
1283 35. endif
1284 36. goto 2.
1285 37. endif
1286 38. endfor
1287 39. if (calculate_register_pressure(PS) > maxRP) then
1288 40. goto 32.
1289 41. endif
1290 42. compute epilogue & prologue
1291 43. finish - succeeded to schedule
1292 */
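/* As a worked example of the window computation above, assume II = 4 and a
   single previously scheduled predecessor v of u with t(v) = 2, l(v) = 2 and
   d(v,u) = 1.  Then Early_start(u) = 2 + 2 - 1*4 = 0, giving start = 0,
   end = 0 + 4 - 1 = 3, step = 1 (illustrative numbers).  */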
1293
1294 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1295 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1296 set to 0 to save compile time. */
1297 #define DFA_HISTORY SMS_DFA_HISTORY
1298
1299 /* Given the partial schedule PS, this function calculates and returns the
1300 cycles in which we can schedule the node with the given index I.
1301 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1302 noticed that there are several cases in which we fail to SMS the loop
1303 because the sched window of a node is empty due to tight data-deps. In
1304 such cases we want to unschedule some of the predecessors/successors
1305 until we get a non-empty scheduling window. It returns -1 if the
1306 scheduling window is empty and zero otherwise. */
1307
1308 static int
1309 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1310 sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1311 {
1312 int start, step, end;
1313 ddg_edge_ptr e;
1314 int u = nodes_order [i];
1315 ddg_node_ptr u_node = &ps->g->nodes[u];
1316 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1317 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1318 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1319 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1320 int psp_not_empty;
1321 int pss_not_empty;
1322
1323 /* 1. compute sched window for u (start, end, step). */
1324 sbitmap_zero (psp);
1325 sbitmap_zero (pss);
1326 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1327 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1328
1329 if (psp_not_empty && !pss_not_empty)
1330 {
1331 int early_start = INT_MIN;
1332
1333 end = INT_MAX;
1334 for (e = u_node->in; e != 0; e = e->next_in)
1335 {
1336 ddg_node_ptr v_node = e->src;
1337 if (TEST_BIT (sched_nodes, v_node->cuid))
1338 {
1339 int node_st = SCHED_TIME (v_node)
1340 + e->latency - (e->distance * ii);
1341
1342 early_start = MAX (early_start, node_st);
1343
1344 if (e->data_type == MEM_DEP)
1345 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1346 }
1347 }
1348 start = early_start;
1349 end = MIN (end, early_start + ii);
1350 step = 1;
1351 }
1352
1353 else if (!psp_not_empty && pss_not_empty)
1354 {
1355 int late_start = INT_MAX;
1356
1357 end = INT_MIN;
1358 for (e = u_node->out; e != 0; e = e->next_out)
1359 {
1360 ddg_node_ptr v_node = e->dest;
1361 if (TEST_BIT (sched_nodes, v_node->cuid))
1362 {
1363 late_start = MIN (late_start,
1364 SCHED_TIME (v_node) - e->latency
1365 + (e->distance * ii));
1366 if (e->data_type == MEM_DEP)
1367 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1368 }
1369 }
1370 start = late_start;
1371 end = MAX (end, late_start - ii);
1372 step = -1;
1373 }
1374
1375 else if (psp_not_empty && pss_not_empty)
1376 {
1377 int early_start = INT_MIN;
1378 int late_start = INT_MAX;
1379
1380 start = INT_MIN;
1381 end = INT_MAX;
1382 for (e = u_node->in; e != 0; e = e->next_in)
1383 {
1384 ddg_node_ptr v_node = e->src;
1385
1386 if (TEST_BIT (sched_nodes, v_node->cuid))
1387 {
1388 early_start = MAX (early_start,
1389 SCHED_TIME (v_node) + e->latency
1390 - (e->distance * ii));
1391 if (e->data_type == MEM_DEP)
1392 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1393 }
1394 }
1395 for (e = u_node->out; e != 0; e = e->next_out)
1396 {
1397 ddg_node_ptr v_node = e->dest;
1398
1399 if (TEST_BIT (sched_nodes, v_node->cuid))
1400 {
1401 late_start = MIN (late_start,
1402 SCHED_TIME (v_node) - e->latency
1403 + (e->distance * ii));
1404 if (e->data_type == MEM_DEP)
1405 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1406 }
1407 }
1408 start = MAX (start, early_start);
1409 end = MIN (end, MIN (early_start + ii, late_start + 1));
1410 step = 1;
1411 }
1412 else /* psp is empty && pss is empty. */
1413 {
1414 start = SCHED_ASAP (u_node);
1415 end = start + ii;
1416 step = 1;
1417 }
1418
1419 *start_p = start;
1420 *step_p = step;
1421 *end_p = end;
1422 sbitmap_free (psp);
1423 sbitmap_free (pss);
1424
1425 if ((start >= end && step == 1) || (start <= end && step == -1))
1426 return -1;
1427 else
1428 return 0;
1429 }
1430
1431 /* This function implements the SMS scheduling algorithm described
1432 above. */
1433 static partial_schedule_ptr
1434 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
1435 {
1436 int ii = mii;
1437 int i, c, success;
1438 int try_again_with_larger_ii = true;
1439 int num_nodes = g->num_nodes;
1440 ddg_edge_ptr e;
1441 int start, end, step; /* Place together into one struct? */
1442 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1443 sbitmap must_precede = sbitmap_alloc (num_nodes);
1444 sbitmap must_follow = sbitmap_alloc (num_nodes);
1445 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1446
1447 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1448
1449 sbitmap_ones (tobe_scheduled);
1450 sbitmap_zero (sched_nodes);
1451
1452 while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1453 || try_again_with_larger_ii ) && ii < maxii)
1454 {
1455 int j;
1456 bool unscheduled_nodes = false;
1457
1458 if (dump_file)
1459 fprintf(dump_file, "Starting with ii=%d\n", ii);
1460 if (try_again_with_larger_ii)
1461 {
1462 try_again_with_larger_ii = false;
1463 sbitmap_zero (sched_nodes);
1464 }
1465
1466 for (i = 0; i < num_nodes; i++)
1467 {
1468 int u = nodes_order[i];
1469 ddg_node_ptr u_node = &ps->g->nodes[u];
1470 rtx insn = u_node->insn;
1471
1472 if (!INSN_P (insn))
1473 {
1474 RESET_BIT (tobe_scheduled, u);
1475 continue;
1476 }
1477
1478 if (JUMP_P (insn)) /* Closing branch handled later. */
1479 {
1480 RESET_BIT (tobe_scheduled, u);
1481 continue;
1482 }
1483
1484 if (TEST_BIT (sched_nodes, u))
1485 continue;
1486
1487 /* Try to get a non-empty scheduling window. */
1488 j = i;
1489 while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1490 && j > 0)
1491 {
1492 unscheduled_nodes = true;
1493 if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1494 || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1495 {
1496 ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1497 RESET_BIT (sched_nodes, nodes_order [j - 1]);
1498 }
1499 j--;
1500 }
1501 if (j < 0)
1502 {
1503 /* ??? Try backtracking instead of immediately ii++? */
1504 ii++;
1505 try_again_with_larger_ii = true;
1506 reset_partial_schedule (ps, ii);
1507 break;
1508 }
1509 /* 2. Try scheduling u in window. */
1510 if (dump_file)
1511 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1512 u, start, end, step);
1513
1514 /* Use must_follow & must_precede bitmaps to determine the order
1515 of nodes within the cycle. */
1516 sbitmap_zero (must_precede);
1517 sbitmap_zero (must_follow);
1518 for (e = u_node->in; e != 0; e = e->next_in)
1519 if (TEST_BIT (sched_nodes, e->src->cuid)
1520 && e->latency == (ii * e->distance)
1521 && start == SCHED_TIME (e->src))
1522 SET_BIT (must_precede, e->src->cuid);
1523
1524 for (e = u_node->out; e != 0; e = e->next_out)
1525 if (TEST_BIT (sched_nodes, e->dest->cuid)
1526 && e->latency == (ii * e->distance)
1527 && end == SCHED_TIME (e->dest))
1528 SET_BIT (must_follow, e->dest->cuid);
1529
1530 success = 0;
1531 if ((step > 0 && start < end) || (step < 0 && start > end))
1532 for (c = start; c != end; c += step)
1533 {
1534 ps_insn_ptr psi;
1535
1536 psi = ps_add_node_check_conflicts (ps, u_node, c,
1537 must_precede,
1538 must_follow);
1539
1540 if (psi)
1541 {
1542 SCHED_TIME (u_node) = c;
1543 SET_BIT (sched_nodes, u);
1544 success = 1;
1545 if (dump_file)
1546 fprintf(dump_file, "Schedule in %d\n", c);
1547 break;
1548 }
1549 }
1550 if (!success)
1551 {
1552 /* ??? Try backtracking instead of immediately ii++? */
1553 ii++;
1554 try_again_with_larger_ii = true;
1555 reset_partial_schedule (ps, ii);
1556 break;
1557 }
1558 if (unscheduled_nodes)
1559 break;
1560
1561 /* ??? If (success), check register pressure estimates. */
1562 } /* Continue with next node. */
1563 } /* While try_again_with_larger_ii. */
1564
1565 sbitmap_free (sched_nodes);
1566 sbitmap_free (must_precede);
1567 sbitmap_free (must_follow);
1568 sbitmap_free (tobe_scheduled);
1569
1570 if (ii >= maxii)
1571 {
1572 free_partial_schedule (ps);
1573 ps = NULL;
1574 }
1575 return ps;
1576 }
1577
1578 \f
1579 /* This page implements the algorithm for ordering the nodes of a DDG
1580 for modulo scheduling, activated through the
1581 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1582
1583 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1584 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1585 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1586 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1587 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1588 #define DEPTH(x) (ASAP ((x)))
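/* For example, a node with ASAP 2 and ALAP 5 has mobility MOB = 3: it can
   be placed in any of cycles 2 .. 5 without stretching the acyclic schedule
   (illustrative numbers).  */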
1589
1590 typedef struct node_order_params * nopa;
1591
1592 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1593 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1594 static nopa calculate_order_params (ddg_ptr, int mii);
1595 static int find_max_asap (ddg_ptr, sbitmap);
1596 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1597 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1598
1599 enum sms_direction {BOTTOMUP, TOPDOWN};
1600
1601 struct node_order_params
1602 {
1603 int asap;
1604 int alap;
1605 int height;
1606 };
1607
1608 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1609 static void
1610 check_nodes_order (int *node_order, int num_nodes)
1611 {
1612 int i;
1613 sbitmap tmp = sbitmap_alloc (num_nodes);
1614
1615 sbitmap_zero (tmp);
1616
1617 for (i = 0; i < num_nodes; i++)
1618 {
1619 int u = node_order[i];
1620
1621 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1622
1623 SET_BIT (tmp, u);
1624 }
1625
1626 sbitmap_free (tmp);
1627 }
1628
1629 /* Order the nodes of G for scheduling and pass the result in
1630 NODE_ORDER. Also set aux.count of each node to ASAP.
1631 Return the recMII for the given DDG. */
1632 static int
1633 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1634 {
1635 int i;
1636 int rec_mii = 0;
1637 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1638
1639 nopa nops = calculate_order_params (g, mii);
1640
1641 order_nodes_of_sccs (sccs, node_order);
1642
1643 if (sccs->num_sccs > 0)
1644 /* First SCC has the largest recurrence_length. */
1645 rec_mii = sccs->sccs[0]->recurrence_length;
1646
1647 /* Save ASAP before destroying node_order_params. */
1648 for (i = 0; i < g->num_nodes; i++)
1649 {
1650 ddg_node_ptr v = &g->nodes[i];
1651 v->aux.count = ASAP (v);
1652 }
1653
1654 free (nops);
1655 free_ddg_all_sccs (sccs);
1656 check_nodes_order (node_order, g->num_nodes);
1657
1658 return rec_mii;
1659 }
1660
1661 static void
1662 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1663 {
1664 int i, pos = 0;
1665 ddg_ptr g = all_sccs->ddg;
1666 int num_nodes = g->num_nodes;
1667 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1668 sbitmap on_path = sbitmap_alloc (num_nodes);
1669 sbitmap tmp = sbitmap_alloc (num_nodes);
1670 sbitmap ones = sbitmap_alloc (num_nodes);
1671
1672 sbitmap_zero (prev_sccs);
1673 sbitmap_ones (ones);
1674
1675 /* Perform the node ordering starting from the SCC with the highest recMII.
1676 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1677 for (i = 0; i < all_sccs->num_sccs; i++)
1678 {
1679 ddg_scc_ptr scc = all_sccs->sccs[i];
1680
1681 /* Add nodes on paths from previous SCCs to the current SCC. */
1682 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1683 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1684
1685 /* Add nodes on paths from the current SCC to previous SCCs. */
1686 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1687 sbitmap_a_or_b (tmp, tmp, on_path);
1688
1689 /* Remove nodes of previous SCCs from current extended SCC. */
1690 sbitmap_difference (tmp, tmp, prev_sccs);
1691
1692 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1693 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1694 }
1695
1696 /* Handle the remaining nodes that do not belong to any scc. Each call
1697 to order_nodes_in_scc handles a single connected component. */
1698 while (pos < g->num_nodes)
1699 {
1700 sbitmap_difference (tmp, ones, prev_sccs);
1701 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1702 }
1703 sbitmap_free (prev_sccs);
1704 sbitmap_free (on_path);
1705 sbitmap_free (tmp);
1706 sbitmap_free (ones);
1707 }
1708
1709 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1710 static struct node_order_params *
1711 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1712 {
1713 int u;
1714 int max_asap;
1715 int num_nodes = g->num_nodes;
1716 ddg_edge_ptr e;
1717 /* Allocate a place to hold ordering params for each node in the DDG. */
1718 nopa node_order_params_arr;
1719
1720 /* Initialize ASAP/ALAP/HEIGHT to zero. */
1721 node_order_params_arr = (nopa) xcalloc (num_nodes,
1722 sizeof (struct node_order_params));
1723
1724 /* Set the aux pointer of each node to point to its order_params structure. */
1725 for (u = 0; u < num_nodes; u++)
1726 g->nodes[u].aux.info = &node_order_params_arr[u];
1727
1728 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1729 calculate ASAP, ALAP, mobility, distance, and height for each node
1730 in the dependence (directed acyclic) graph. */
1731
1732 /* We assume that the nodes in the array are in topological order. */
1733
1734 max_asap = 0;
1735 for (u = 0; u < num_nodes; u++)
1736 {
1737 ddg_node_ptr u_node = &g->nodes[u];
1738
1739 ASAP (u_node) = 0;
1740 for (e = u_node->in; e; e = e->next_in)
1741 if (e->distance == 0)
1742 ASAP (u_node) = MAX (ASAP (u_node),
1743 ASAP (e->src) + e->latency);
1744 max_asap = MAX (max_asap, ASAP (u_node));
1745 }
1746
1747 for (u = num_nodes - 1; u > -1; u--)
1748 {
1749 ddg_node_ptr u_node = &g->nodes[u];
1750
1751 ALAP (u_node) = max_asap;
1752 HEIGHT (u_node) = 0;
1753 for (e = u_node->out; e; e = e->next_out)
1754 if (e->distance == 0)
1755 {
1756 ALAP (u_node) = MIN (ALAP (u_node),
1757 ALAP (e->dest) - e->latency);
1758 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1759 HEIGHT (e->dest) + e->latency);
1760 }
1761 }
1762
1763 return node_order_params_arr;
1764 }
1765
1766 static int
1767 find_max_asap (ddg_ptr g, sbitmap nodes)
1768 {
1769 unsigned int u = 0;
1770 int max_asap = -1;
1771 int result = -1;
1772 sbitmap_iterator sbi;
1773
1774 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1775 {
1776 ddg_node_ptr u_node = &g->nodes[u];
1777
1778 if (max_asap < ASAP (u_node))
1779 {
1780 max_asap = ASAP (u_node);
1781 result = u;
1782 }
1783 }
1784 return result;
1785 }

static int
find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_hv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_hv < HEIGHT (u_node))
        {
          max_hv = HEIGHT (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_hv == HEIGHT (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}

static int
find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_dv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_dv < DEPTH (u_node))
        {
          max_dv = DEPTH (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_dv == DEPTH (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}
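
/* Illustration only: if the workset holds nodes x and y with
   HEIGHT(x) = HEIGHT(y) = 3, MOB(x) = 2 and MOB(y) = 0, then
   find_max_hv_min_mob returns y -- equal heights are broken in favor
   of the smaller (more critical) mobility.  find_max_dv_min_mob is the
   mirror image, keyed on DEPTH for the bottom-up direction.  */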

/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
                    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
        SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_hv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_succs = NODE_SUCCESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_succs, scc);

              /* Don't consider the already ordered successors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = BOTTOMUP;
          sbitmap_zero (predecessors);
          find_predecessors (predecessors, g, nodes_ordered);
          sbitmap_a_and_b (workset, predecessors, scc);
        }
      else
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_dv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_preds = NODE_PREDECESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_preds, scc);

              /* Don't consider the already ordered predecessors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = TOPDOWN;
          sbitmap_zero (successors);
          find_successors (successors, g, nodes_ordered);
          sbitmap_a_and_b (workset, successors, scc);
        }
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
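
/* Illustration only: for the first SCC, no nodes are ordered yet, so
   the function starts BOTTOMUP from the single node with maximal ASAP;
   the inner loop then drains the workset picking maximal-depth
   (minimal-mobility on ties) nodes, after which the direction flips to
   TOPDOWN over the SCC successors of everything ordered so far, and so
   on, alternating until the whole SCC is placed in NODE_ORDER.  */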

\f
/* This page contains functions for manipulating partial-schedules during
   modulo scheduling.  */

/* Create a partial schedule and allocate memory to hold II rows.  */

static partial_schedule_ptr
create_partial_schedule (int ii, ddg_ptr g, int history)
{
  partial_schedule_ptr ps = XNEW (struct partial_schedule);
  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
  ps->ii = ii;
  ps->history = history;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
  ps->g = g;

  return ps;
}
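
/* Illustration only: with ii = 3, an insn placed at cycle 7 lands in
   row SMODULO (7, 3) = 1, and one placed at cycle -2 also lands in
   row 1, since SMODULO keeps the result non-negative; the rows array
   thus folds the whole schedule onto the ii rows of the kernel.  */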

/* Free the PS_INSNs in the rows array of the given partial schedule.
   ??? Consider caching the PS_INSNs.  */
static void
free_ps_insns (partial_schedule_ptr ps)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      while (ps->rows[i])
        {
          ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;

          free (ps->rows[i]);
          ps->rows[i] = ps_insn;
        }
      ps->rows[i] = NULL;
    }
}

/* Free all the memory allocated to the partial schedule.  */

static void
free_partial_schedule (partial_schedule_ptr ps)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  free (ps->rows);
  free (ps);
}

/* Clear the rows array with its PS_INSNs, and create a new one with
   NEW_II rows.  */

static void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  if (new_ii == ps->ii)
    return;
  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
                                       * sizeof (ps_insn_ptr));
  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
  ps->ii = new_ii;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
}

/* Prints the partial schedule as an ii-row array; for each row, print
   the ids of the insns in it.  */
void
print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      ps_insn_ptr ps_i = ps->rows[i];

      fprintf (dump, "\n[CYCLE %d ]: ", i);
      while (ps_i)
        {
          fprintf (dump, "%d, ",
                   INSN_UID (ps_i->node->insn));
          ps_i = ps_i->next_in_row;
        }
    }
}
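
/* Illustration only: for ii = 2 with insn uids 23 and 24 in row 0 and
   uid 25 in row 1, the dump would read:
   [CYCLE 0 ]: 23, 24,
   [CYCLE 1 ]: 25,  */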

/* Creates an object of PS_INSN and initializes it to the given parameters.  */
static ps_insn_ptr
create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
{
  ps_insn_ptr ps_i = XNEW (struct ps_insn);

  ps_i->node = node;
  ps_i->next_in_row = NULL;
  ps_i->prev_in_row = NULL;
  ps_i->row_rest_count = rest_count;
  ps_i->cycle = cycle;

  return ps_i;
}

/* Removes the given PS_INSN from the partial schedule.  Returns false if the
   node is not found in the partial schedule, else returns true.  */
static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
  int row;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);
  if (! ps_i->prev_in_row)
    {
      if (ps_i != ps->rows[row])
        return false;

      ps->rows[row] = ps_i->next_in_row;
      if (ps->rows[row])
        ps->rows[row]->prev_in_row = NULL;
    }
  else
    {
      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
    }
  free (ps_i);
  return true;
}

/* Unlike what literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede, and insert
     the node immediately after the last must-precede, checking that no
     must-follow appears before that point (otherwise there is no valid
     column).  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
          && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
             there is no possible column.  */
          if (first_must_follow)
            return false;
          else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node after LAST_MUST_PRECEDE, or at the head of the
     row if there is none.  */

  if (! last_must_precede)
    {
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
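
/* Illustration only: suppose the row currently holds insns A, B, C,
   the new node must come after A (A's bit is set in MUST_PRECEDE) and
   before C (C's bit is set in MUST_FOLLOW).  The scan records
   last_must_precede = A and first_must_follow = C, and the node is
   linked in right after A, i.e. between A and B.  Had C appeared
   before A in the row, the function would return false.  */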

/* Advances the PS_INSN one column in its current row; returns false
   on failure and true on success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                        sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}

/* Inserts a DDG_NODE to the given partial schedule at the given cycle.
   Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
   before/after (respectively) the node pointed to by PS_I when scheduled
   in the same cycle.  */
static ps_insn_ptr
add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
                sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr ps_i;
  int rest_count = 1;
  int row = SMODULO (cycle, ps->ii);

  if (ps->rows[row]
      && ps->rows[row]->row_rest_count >= issue_rate)
    return NULL;

  if (ps->rows[row])
    rest_count += ps->rows[row]->row_rest_count;

  ps_i = create_ps_insn (node, rest_count, cycle);

  /* Finds and inserts PS_I according to MUST_FOLLOW and
     MUST_PRECEDE.  */
  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
    {
      free (ps_i);
      return NULL;
    }

  return ps_i;
}
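
/* Illustration only: row_rest_count is a crude per-row slot counter --
   with issue_rate = 4 the first insn added to a row carries count 1,
   the second count 2, and so on; once the insn at the head of the row
   carries a count of 4 or more, add_node_to_ps rejects further insns
   for that cycle.  */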

/* Advance time one cycle.  Assumes DFA is being used.  */
static void
advance_one_cycle (void)
{
  if (targetm.sched.dfa_pre_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_pre_cycle_insn ());

  state_transition (curr_state, NULL);

  if (targetm.sched.dfa_post_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_post_cycle_insn ());
}

/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), find
   the number of cycles the kernel takes according to the DFA; we use
   this to check whether we did well with SMS after register moves are
   added.  In some cases the register-move overhead makes the result
   even worse than the original loop.  We want SMS to be performed
   only when it yields fewer cycles after register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
        continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      /* Update the DFA state; if the DFA found resource conflicts,
         start a new cycle.  */
      if (state_transition (curr_state, insn) >= 0)
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      if (targetm.sched.variable_issue)
        can_issue_more =
          targetm.sched.variable_issue (sched_dump, sched_verbose,
                                        insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
         let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
               && GET_CODE (PATTERN (insn)) != CLOBBER)
        can_issue_more--;
    }
  return cycles;
}
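
#if 0
/* A minimal sketch (illustration only, not part of this file) of how a
   caller might use kernel_number_of_cycles to decide whether SMS paid
   off; FIRST/LAST delimit the kernel, CYCLES_BEFORE is the count taken
   before register moves were added, and the surrounding undo logic is
   hypothetical.  */
static bool
sms_was_profitable (rtx first, rtx last, int cycles_before)
{
  int cycles_after = kernel_number_of_cycles (first, last);
  return cycles_after < cycles_before;
}
#endif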

/* Checks if PS has resource conflicts according to DFA, starting from
   FROM cycle to TO cycle; returns true if there are conflicts and false
   if there are no conflicts.  Assumes DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
           crr_insn;
           crr_insn = crr_insn->next_in_row)
        {
          rtx insn = crr_insn->node->insn;

          if (!INSN_P (insn))
            continue;

          /* Check if there is room for the current insn.  */
          if (!can_issue_more || state_dead_lock_p (curr_state))
            return true;

          /* Update the DFA state and return with failure if the DFA found
             resource conflicts.  */
          if (state_transition (curr_state, insn) >= 0)
            return true;

          if (targetm.sched.variable_issue)
            can_issue_more =
              targetm.sched.variable_issue (sched_dump, sched_verbose,
                                            insn, can_issue_more);
          /* A naked CLOBBER or USE generates no instruction, so don't
             let them consume issue slots.  */
          else if (GET_CODE (PATTERN (insn)) != USE
                   && GET_CODE (PATTERN (insn)) != CLOBBER)
            can_issue_more--;
        }

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}

/* Checks if the given node causes resource conflicts when added to PS at
   cycle C.  If not, the node is added to PS and returned; otherwise NULL
   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
   cuid N must come before/after (respectively) the node pointed to by
   PS_I when scheduled in the same cycle.  */
ps_insn_ptr
ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
                             int c, sbitmap must_precede,
                             sbitmap must_follow)
{
  int has_conflicts = 0;
  ps_insn_ptr ps_i;

  /* First add the node to the PS, if this succeeds check for
     conflicts, trying different issue slots in the same row.  */
  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
    return NULL; /* Failed to insert the node at the given cycle.  */

  has_conflicts = ps_has_conflicts (ps, c, c)
                  || (ps->history > 0
                      && ps_has_conflicts (ps,
                                           c - ps->history,
                                           c + ps->history));

  /* Try different issue slots to find one that the given node can be
     scheduled in without conflicts.  */
  while (has_conflicts)
    {
      if (! ps_insn_advance_column (ps, ps_i, must_follow))
        break;
      has_conflicts = ps_has_conflicts (ps, c, c)
                      || (ps->history > 0
                          && ps_has_conflicts (ps,
                                               c - ps->history,
                                               c + ps->history));
    }

  if (has_conflicts)
    {
      remove_node_from_ps (ps, ps_i);
      return NULL;
    }

  ps->min_cycle = MIN (ps->min_cycle, c);
  ps->max_cycle = MAX (ps->max_cycle, c);
  return ps_i;
}
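
/* Illustration only: if node N fits at cycle C structurally but the
   DFA reports a conflict (e.g. two insns in the row both need the same
   functional unit), the loop above slides N one column to the right at
   a time; if it reaches the end of the row still conflicting, the node
   is removed again and the caller must try another cycle.  */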

/* Rotate the rows of PS such that insns scheduled at time
   START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
void
rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
{
  int i, row, backward_rotates;
  int last_row = ps->ii - 1;

  if (start_cycle == 0)
    return;

  backward_rotates = SMODULO (start_cycle, ps->ii);

  /* Revisit later and optimize this into a single loop.  */
  for (i = 0; i < backward_rotates; i++)
    {
      ps_insn_ptr first_row = ps->rows[0];

      for (row = 0; row < last_row; row++)
        ps->rows[row] = ps->rows[row+1];

      ps->rows[last_row] = first_row;
    }

  ps->max_cycle -= start_cycle;
  ps->min_cycle -= start_cycle;
}
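
/* Illustration only: for ii = 4 and start_cycle = 6, backward_rotates
   = SMODULO (6, 4) = 2, so old rows 2,3,0,1 become new rows 0,1,2,3;
   the insns that were scheduled at cycle 6 (old row 2) now head
   row 0.  */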

/* Remove the node N from the partial schedule PS.  Because we restart
   the DFA each time we want to check for resource conflicts, this is
   equivalent to unscheduling the node N.  */
static bool
ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
{
  ps_insn_ptr ps_i;
  int row = SMODULO (SCHED_TIME (n), ps->ii);

  if (row < 0 || row >= ps->ii)
    return false;

  for (ps_i = ps->rows[row];
       ps_i && ps_i->node != n;
       ps_i = ps_i->next_in_row);
  if (!ps_i)
    return false;

  return remove_node_from_ps (ps, ps_i);
}
#endif /* INSN_SCHEDULING */
\f
static bool
gate_handle_sms (void)
{
  return (optimize > 0 && flag_modulo_sched);
}


/* Perform SMS modulo scheduling.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* We want to be able to create new pseudos.  */
  no_new_pseudos = 0;
  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();
  allocate_reg_info (max_regno, FALSE, FALSE);
  update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
                    (PROP_DEATH_NOTES
                     | PROP_REG_INFO
                     | PROP_KILL_DEAD_CODE
                     | PROP_SCAN_DEAD_CODE));

  no_new_pseudos = 1;

  /* Finalize layout changes.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  cfg_layout_finalize ();
  free_dominance_info (CDI_DOMINATORS);
#endif /* INSN_SCHEDULING */
  return 0;
}

struct tree_opt_pass pass_sms =
{
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  TODO_dump_func,                       /* todo_flags_start */
  TODO_dump_func |
  TODO_ggc_collect,                     /* todo_flags_finish */
  'm'                                   /* letter */
};