1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
22
23
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "ddg.h"
49 #include "timevar.h"
50 #include "tree-pass.h"
51
52 #ifdef INSN_SCHEDULING
53
54 /* This file contains the implementation of the Swing Modulo Scheduler,
55 described in the following references:
56 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero, and J. Eckhardt.
57 Lifetime-sensitive modulo scheduling in a production environment.
58 IEEE Trans. on Comps., 50(3), March 2001.
59 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
60 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
61 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62
63 The basic structure is:
64 1. Build a data-dependence graph (DDG) for each loop.
65 2. Use the DDG to order the insns of a loop (not necessarily in
66 topological order), trying to place each insn after all its
67 predecessors _or_ after all its successors.
68 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
69 4. Use the ordering to perform list-scheduling of the loop:
70 1. Set II = MII. We will try to schedule the loop within II cycles.
71 2. Try to schedule the insns one by one according to the ordering.
72 For each insn compute an interval of cycles by considering already-
73 scheduled preds and succs (and associated latencies); try to place
74 the insn in the cycles of this window checking for potential
75 resource conflicts (using the DFA interface).
76 Note: this is different from the cycle-scheduling of schedule_insns;
77 here the insns are not scheduled monotonically top-down (nor bottom-
78 up).
79 3. If we fail to schedule all the insns, increment II and try again,
80 unless II reaches an upper bound MaxII, in which case report failure.
81 5. If we succeed in scheduling the loop within II cycles, we now
82 generate the prolog and epilog, decrement the loop count, and
83 perform modulo variable expansion for live ranges that span more than
84 II cycles (i.e. use register copies to prevent a def from overwriting
85 itself before reaching the use).
86 */
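/* An illustrative sketch of step 5 (ours, not from the references above):
   with II = 2, a register r defined at cycle 0 and used at cycle 5 stays
   live across (5 - 0) / 2 = 2 kernel iterations, so the kernel would
   clobber r before the late use reads it.  Modulo variable expansion
   emits copies just before the def of r in the kernel,

       r2 = r1;
       r1 = r;

   and rewrites the late use to read r2, the value defined two kernel
   iterations earlier.  */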
87
88 \f
89 /* This page defines partial-schedule structures and functions for
90 modulo scheduling. */
91
92 typedef struct partial_schedule *partial_schedule_ptr;
93 typedef struct ps_insn *ps_insn_ptr;
94
95 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
96 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97
98 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
99 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100
101 /* Perform signed modulo, always returning a non-negative value. */
102 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103
104 /* The number of different iterations the nodes in ps span, assuming
105 the stage boundaries are placed efficiently. */
106 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
107 + 1 + (ps)->ii - 1) / (ps)->ii)
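/* A worked instance of the two macros above (our numbers): SMODULO (-3, 4)
   yields -3 % 4 + 4 = 1, where plain -3 % 4 may be -3 in C; and with
   min_cycle = 2, max_cycle = 9 and ii = 4, PS_STAGE_COUNT computes
   (9 - 2 + 1 + 4 - 1) / 4 = 11 / 4 = 2 stages.  */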
108
109 /* A single instruction in the partial schedule. */
110 struct ps_insn
111 {
112 /* The corresponding DDG_NODE. */
113 ddg_node_ptr node;
114
115 /* The (absolute) cycle in which the PS instruction is scheduled.
116 Same as SCHED_TIME (node). */
117 int cycle;
118
119 /* The next/prev PS_INSN in the same row. */
120 ps_insn_ptr next_in_row,
121 prev_in_row;
122
123 /* The number of nodes in the same row that come after this node. */
124 int row_rest_count;
125 };
126
127 /* Holds the partial schedule as an array of II rows. Each entry of the
128 array points to a linked list of PS_INSNs, which represents the
129 instructions that are scheduled for that row. */
130 struct partial_schedule
131 {
132 int ii; /* Number of rows in the partial schedule. */
133 int history; /* Threshold for conflict checking using DFA. */
134
135 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
136 ps_insn_ptr *rows;
137
138 /* The earliest absolute cycle of an insn in the partial schedule. */
139 int min_cycle;
140
141 /* The latest absolute cycle of an insn in the partial schedule. */
142 int max_cycle;
143
144 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
145 };
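/* To illustrate the layout (our example): with ii = 3, an insn scheduled
   at absolute cycle 7 is kept in rows[SMODULO (7, 3)] = rows[1]; all insns
   whose cycles are congruent modulo ii end up in the same row, i.e. in the
   same cycle of the emitted kernel.  */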
146
147 /* We use this to record all the register replacements we do in
148 the kernel so we can undo SMS if it is not profitable. */
149 struct undo_replace_buff_elem
150 {
151 rtx insn;
152 rtx orig_reg;
153 rtx new_reg;
154 struct undo_replace_buff_elem *next;
155 };
156
157
158
159 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
160 static void free_partial_schedule (partial_schedule_ptr);
161 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
162 void print_partial_schedule (partial_schedule_ptr, FILE *);
163 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
164 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
165 ddg_node_ptr node, int cycle,
166 sbitmap must_precede,
167 sbitmap must_follow);
168 static void rotate_partial_schedule (partial_schedule_ptr, int);
169 void set_row_column_for_ps (partial_schedule_ptr);
170 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr);
171
172 \f
173 /* This page defines constants and structures for the modulo scheduling
174 driver. */
175
176 /* As in haifa-sched.c: */
177 /* issue_rate is the number of insns that can be scheduled in the same
178 machine cycle. It can be defined in the config/mach/mach.h file,
179 otherwise we set it to 1. */
180
181 static int issue_rate;
182
183 static int sms_order_nodes (ddg_ptr, int, int * result);
184 static void set_node_sched_params (ddg_ptr);
185 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
186 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
187 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *, rtx);
188 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
189 int from_stage, int to_stage,
190 int is_prolog);
191
192 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
193 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
194 #define SCHED_FIRST_REG_MOVE(x) \
195 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
196 #define SCHED_NREG_MOVES(x) \
197 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
198 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
199 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
200 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
201
202 /* The scheduling parameters held for each node. */
203 typedef struct node_sched_params
204 {
205 int asap; /* A lower-bound on the absolute scheduling cycle. */
206 int time; /* The absolute scheduling cycle (time >= asap). */
207
208 /* The following field (first_reg_move) is a pointer to the first
209 register-move instruction added to handle the modulo-variable-expansion
210 of the register defined by this node. This register-move copies the
211 original register defined by the node. */
212 rtx first_reg_move;
213
214 /* The number of register-move instructions added, immediately preceding
215 first_reg_move. */
216 int nreg_moves;
217
218 int row; /* Holds time % ii. */
219 int stage; /* Holds time / ii. */
220
221 /* The column of a node inside the ps. If nodes u, v are on the same row,
222 u will precede v if column (u) < column (v). */
223 int column;
224 } *node_sched_params_ptr;
225
226 \f
227 /* The following functions are copied from the current scheduler
228 code in order to use sched_analyze() for computing the dependencies.
229 They are used when initializing the sched_info structure. */
230 static const char *
231 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
232 {
233 static char tmp[80];
234
235 sprintf (tmp, "i%4d", INSN_UID (insn));
236 return tmp;
237 }
238
239 static void
240 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
241 regset cond_exec ATTRIBUTE_UNUSED,
242 regset used ATTRIBUTE_UNUSED,
243 regset set ATTRIBUTE_UNUSED)
244 {
245 }
246
247 static struct sched_info sms_sched_info =
248 {
249 NULL,
250 NULL,
251 NULL,
252 NULL,
253 NULL,
254 sms_print_insn,
255 NULL,
256 compute_jump_reg_dependencies,
257 NULL, NULL,
258 NULL, NULL,
259 0, 0, 0,
260
261 NULL, NULL, NULL, NULL, NULL,
262 0
263 };
264
265
266 /* Return the register decremented and tested in INSN,
267 or NULL_RTX if it is not a decrement-and-branch insn. */
268
269 static rtx
270 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
271 {
272 #ifdef HAVE_doloop_end
273 rtx pattern, reg, condition;
274
275 if (! JUMP_P (insn))
276 return NULL_RTX;
277
278 pattern = PATTERN (insn);
279 condition = doloop_condition_get (pattern);
280 if (! condition)
281 return NULL_RTX;
282
283 if (REG_P (XEXP (condition, 0)))
284 reg = XEXP (condition, 0);
285 else if (GET_CODE (XEXP (condition, 0)) == PLUS
286 && REG_P (XEXP (XEXP (condition, 0), 0)))
287 reg = XEXP (XEXP (condition, 0), 0);
288 else
289 gcc_unreachable ();
290
291 return reg;
292 #else
293 return NULL_RTX;
294 #endif
295 }
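/* For intuition, the two condition shapes accepted above correspond
   roughly (see doloop_condition_get) to branches of the forms

       (ne (reg) (const_int 1))
       (ne (plus (reg) (const_int -1)) (const_int 0))

   i.e. the count register is compared either before or after being
   decremented; in both cases the reg returned holds the loop count.  */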
296
297 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
298 that the number of iterations is a compile-time constant. If so,
299 return the insn that sets COUNT_REG to a constant, and set COUNT to
300 this constant. Otherwise return NULL_RTX. */
301 static rtx
302 const_iteration_count (rtx count_reg, basic_block pre_header,
303 HOST_WIDEST_INT * count)
304 {
305 rtx insn;
306 rtx head, tail;
307
308 if (! pre_header)
309 return NULL_RTX;
310
311 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
312
313 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
314 if (INSN_P (insn) && single_set (insn)
315 && rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
316 {
317 rtx pat = single_set (insn);
318
319 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
320 {
321 *count = INTVAL (SET_SRC (pat));
322 return insn;
323 }
324
325 return NULL_RTX;
326 }
327
328 return NULL_RTX;
329 }
330
331 /* A very simple resource-based lower bound on the initiation interval.
332 ??? Improve the accuracy of this bound by considering the
333 utilization of various units. */
334 static int
335 res_MII (ddg_ptr g)
336 {
337 return (g->num_nodes / issue_rate);
338 }
339
340
341 /* Points to the array that contains the sched data for each node. */
342 static node_sched_params_ptr node_sched_params;
343
344 /* Allocate sched_params for each node and initialize it. Assumes that
345 the aux field of each node contains the asap bound (computed earlier),
346 and copies it into the sched_params field. */
347 static void
348 set_node_sched_params (ddg_ptr g)
349 {
350 int i;
351
352 /* Allocate for each node in the DDG a place to hold the "sched_data". */
353 /* Initialize ASAP/ALAP/HEIGHT to zero. */
354 node_sched_params = (node_sched_params_ptr)
355 xcalloc (g->num_nodes,
356 sizeof (struct node_sched_params));
357
358 /* Set the pointer of the general data of the node to point to the
359 appropriate sched_params structure. */
360 for (i = 0; i < g->num_nodes; i++)
361 {
362 /* Watch out for aliasing problems? */
363 node_sched_params[i].asap = g->nodes[i].aux.count;
364 g->nodes[i].aux.info = &node_sched_params[i];
365 }
366 }
367
368 static void
369 print_node_sched_params (FILE * file, int num_nodes)
370 {
371 int i;
372
373 if (! file)
374 return;
375 for (i = 0; i < num_nodes; i++)
376 {
377 node_sched_params_ptr nsp = &node_sched_params[i];
378 rtx reg_move = nsp->first_reg_move;
379 int j;
380
381 fprintf (file, "Node %d:\n", i);
382 fprintf (file, " asap = %d\n", nsp->asap);
383 fprintf (file, " time = %d\n", nsp->time);
384 fprintf (file, " nreg_moves = %d\n", nsp->nreg_moves);
385 for (j = 0; j < nsp->nreg_moves; j++)
386 {
387 fprintf (file, " reg_move = ");
388 print_rtl_single (file, reg_move);
389 reg_move = PREV_INSN (reg_move);
390 }
391 }
392 }
393
394 /* Calculate an upper bound for II. SMS should not schedule the loop if it
395 requires more cycles than this bound. Currently set to the sum, over all
396 nodes, of the longest latency of their outgoing edges. Revisit based on experiments. */
397 static int
398 calculate_maxii (ddg_ptr g)
399 {
400 int i;
401 int maxii = 0;
402
403 for (i = 0; i < g->num_nodes; i++)
404 {
405 ddg_node_ptr u = &g->nodes[i];
406 ddg_edge_ptr e;
407 int max_edge_latency = 0;
408
409 for (e = u->out; e; e = e->next_out)
410 max_edge_latency = MAX (max_edge_latency, e->latency);
411
412 maxii += max_edge_latency;
413 }
414 return maxii;
415 }
416
417 /*
418 Breaking intra-loop register anti-dependences:
419 Each intra-loop register anti-dependence implies a cross-iteration true
420 dependence of distance 1. Therefore, we can remove such false dependences
421 and check whether the partial schedule broke them: for a true dependence
422 of distance 1 this is the case when SCHED_TIME (def) < SCHED_TIME (use);
423 if so we generate a register move. The number of such moves is equal to:
424                SCHED_TIME (use) - SCHED_TIME (def)       { 0  broken
425   nreg_moves = ----------------------------------- + 1 - {    dependence,
426                                ii                        { 1  if not.
427 */
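/* A worked instance of the formula (our numbers): with ii = 2,
   SCHED_TIME (def) = 0 and SCHED_TIME (use) = 5, a distance-0 true
   dependence needs (5 - 0) / 2 = 2 reg_moves, while a distance-1
   dependence with the same times needs (5 - 0 + 2) / 2 = 3; this is
   exactly the nreg_moves4e computation in generate_reg_moves below.  */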
428 static struct undo_replace_buff_elem *
429 generate_reg_moves (partial_schedule_ptr ps, bool rescan)
430 {
431 ddg_ptr g = ps->g;
432 int ii = ps->ii;
433 int i;
434 struct undo_replace_buff_elem *reg_move_replaces = NULL;
435
436 for (i = 0; i < g->num_nodes; i++)
437 {
438 ddg_node_ptr u = &g->nodes[i];
439 ddg_edge_ptr e;
440 int nreg_moves = 0, i_reg_move;
441 sbitmap *uses_of_defs;
442 rtx last_reg_move;
443 rtx prev_reg, old_reg;
444
445 /* Compute the number of reg_moves needed for u, by looking at life
446 ranges started at u (excluding self-loops). */
447 for (e = u->out; e; e = e->next_out)
448 if (e->type == TRUE_DEP && e->dest != e->src)
449 {
450 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
451
452 if (e->distance == 1)
453 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
454
455 /* If dest precedes src in the schedule of the kernel, then dest
456 will read before src writes and we can save one reg_copy. */
457 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
458 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
459 nreg_moves4e--;
460
461 nreg_moves = MAX (nreg_moves, nreg_moves4e);
462 }
463
464 if (nreg_moves == 0)
465 continue;
466
467 /* Every use of the register defined by node may require a different
468 copy of this register, depending on the time the use is scheduled.
469 Set a bitmap vector, telling which nodes use each copy of this
470 register. */
471 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
472 sbitmap_vector_zero (uses_of_defs, nreg_moves);
473 for (e = u->out; e; e = e->next_out)
474 if (e->type == TRUE_DEP && e->dest != e->src)
475 {
476 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
477
478 if (e->distance == 1)
479 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
480
481 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
482 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
483 dest_copy--;
484
485 if (dest_copy)
486 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
487 }
488
489 /* Now generate the reg_moves, attaching relevant uses to them. */
490 SCHED_NREG_MOVES (u) = nreg_moves;
491 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
492 last_reg_move = u->insn;
493
494 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
495 {
496 unsigned int i_use = 0;
497 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
498 rtx reg_move = gen_move_insn (new_reg, prev_reg);
499 sbitmap_iterator sbi;
500
501 add_insn_before (reg_move, last_reg_move, NULL);
502 last_reg_move = reg_move;
503
504 if (!SCHED_FIRST_REG_MOVE (u))
505 SCHED_FIRST_REG_MOVE (u) = reg_move;
506
507 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
508 {
509 struct undo_replace_buff_elem *rep;
510
511 rep = (struct undo_replace_buff_elem *)
512 xcalloc (1, sizeof (struct undo_replace_buff_elem));
513 rep->insn = g->nodes[i_use].insn;
514 rep->orig_reg = old_reg;
515 rep->new_reg = new_reg;
516
517 if (! reg_move_replaces)
518 reg_move_replaces = rep;
519 else
520 {
521 rep->next = reg_move_replaces;
522 reg_move_replaces = rep;
523 }
524
525 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
526 if (rescan)
527 df_insn_rescan (g->nodes[i_use].insn);
528 }
529
530 prev_reg = new_reg;
531 }
532 sbitmap_vector_free (uses_of_defs);
533 }
534 return reg_move_replaces;
535 }
536
537 /* We call this when we want to undo the SMS schedule for a given loop.
538 One of the things that we do is to delete the register moves generated
539 for the sake of SMS; this function deletes the register move instructions
540 recorded in the undo buffer. */
541 static void
542 undo_generate_reg_moves (partial_schedule_ptr ps,
543 struct undo_replace_buff_elem *reg_move_replaces)
544 {
545 int i, j;
546
547 for (i = 0; i < ps->g->num_nodes; i++)
548 {
549 ddg_node_ptr u = &ps->g->nodes[i];
550 rtx prev;
551 rtx crr = SCHED_FIRST_REG_MOVE (u);
552
553 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
554 {
555 prev = PREV_INSN (crr);
556 delete_insn (crr);
557 crr = prev;
558 }
559 SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
560 }
561
562 while (reg_move_replaces)
563 {
564 struct undo_replace_buff_elem *rep = reg_move_replaces;
565
566 reg_move_replaces = reg_move_replaces->next;
567 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
568 }
569 }
570
571 /* Free memory allocated for the undo buffer. */
572 static void
573 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
574 {
575
576 while (reg_move_replaces)
577 {
578 struct undo_replace_buff_elem *rep = reg_move_replaces;
579
580 reg_move_replaces = reg_move_replaces->next;
581 free (rep);
582 }
583 }
584
585 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
586 of SCHED_ROW and SCHED_STAGE. */
587 static void
588 normalize_sched_times (partial_schedule_ptr ps)
589 {
590 int i;
591 ddg_ptr g = ps->g;
592 int amount = PS_MIN_CYCLE (ps);
593 int ii = ps->ii;
594
595 /* Don't include the closing branch assuming that it is the last node. */
596 for (i = 0; i < g->num_nodes - 1; i++)
597 {
598 ddg_node_ptr u = &g->nodes[i];
599 int normalized_time = SCHED_TIME (u) - amount;
600
601 gcc_assert (normalized_time >= 0);
602
603 SCHED_TIME (u) = normalized_time;
604 SCHED_ROW (u) = normalized_time % ii;
605 SCHED_STAGE (u) = normalized_time / ii;
606 }
607 }
608
609 /* Set SCHED_COLUMN of each node according to its position in PS. */
610 static void
611 set_columns_for_ps (partial_schedule_ptr ps)
612 {
613 int row;
614
615 for (row = 0; row < ps->ii; row++)
616 {
617 ps_insn_ptr cur_insn = ps->rows[row];
618 int column = 0;
619
620 for (; cur_insn; cur_insn = cur_insn->next_in_row)
621 SCHED_COLUMN (cur_insn->node) = column++;
622 }
623 }
624
625 /* Permute the insns according to their order in PS, from row 0 to
626 row ii-1, and position them right before LAST. This schedules
627 the insns of the loop kernel. */
628 static void
629 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
630 {
631 int ii = ps->ii;
632 int row;
633 ps_insn_ptr ps_ij;
634
635 for (row = 0; row < ii ; row++)
636 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
637 if (PREV_INSN (last) != ps_ij->node->insn)
638 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
639 PREV_INSN (last));
640 }
641
642 /* As part of undoing SMS we return to the original ordering of the
643 instructions inside the loop kernel. Given the partial schedule PS, this
644 function restores the ordering of the instructions according to their CUIDs
645 in the DDG (PS->G), which is the original order of the instructions before
646 performing SMS. */
647 static void
648 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
649 {
650 int i;
651
652 for (i = 0 ; i < ps->g->num_nodes; i++)
653 if (last == ps->g->nodes[i].insn
654 || last == ps->g->nodes[i].first_note)
655 break;
656 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
657 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
658 PREV_INSN (last));
659 }
660
661 /* Used to generate the prologue & epilogue. Duplicate the subset of
662 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
663 of both), together with a prefix/suffix of their reg_moves. */
664 static void
665 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
666 int to_stage, int for_prolog)
667 {
668 int row;
669 ps_insn_ptr ps_ij;
670
671 for (row = 0; row < ps->ii; row++)
672 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
673 {
674 ddg_node_ptr u_node = ps_ij->node;
675 int j, i_reg_moves;
676 rtx reg_move = NULL_RTX;
677
678 if (for_prolog)
679 {
680 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
681 number of reg_moves starting with the second occurrence of
682 u_node, which is generated if its SCHED_STAGE <= to_stage. */
683 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
684 i_reg_moves = MAX (i_reg_moves, 0);
685 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
686
687 /* The reg_moves start from the *first* reg_move backwards. */
688 if (i_reg_moves)
689 {
690 reg_move = SCHED_FIRST_REG_MOVE (u_node);
691 for (j = 1; j < i_reg_moves; j++)
692 reg_move = PREV_INSN (reg_move);
693 }
694 }
695 else /* It's for the epilog. */
696 {
697 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
698 starting to decrease one stage after u_node no longer occurs;
699 that is, generate all reg_moves until
700 SCHED_STAGE (u_node) == from_stage - 1. */
701 i_reg_moves = SCHED_NREG_MOVES (u_node)
702 - (from_stage - SCHED_STAGE (u_node) - 1);
703 i_reg_moves = MAX (i_reg_moves, 0);
704 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
705
706 /* The reg_moves start from the *last* reg_move forwards. */
707 if (i_reg_moves)
708 {
709 reg_move = SCHED_FIRST_REG_MOVE (u_node);
710 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
711 reg_move = PREV_INSN (reg_move);
712 }
713 }
714
715 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
716 emit_insn (copy_rtx (PATTERN (reg_move)));
717 if (SCHED_STAGE (u_node) >= from_stage
718 && SCHED_STAGE (u_node) <= to_stage)
719 duplicate_insn_chain (u_node->first_note, u_node->insn);
720 }
721 }
722
723
724 /* Generate the instructions (including reg_moves) for prolog & epilog. */
725 static void
726 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
727 {
728 int i;
729 int last_stage = PS_STAGE_COUNT (ps) - 1;
730 edge e;
731
732 /* Generate the prolog, inserting its insns on the loop-entry edge. */
733 start_sequence ();
734
735 if (count_reg)
736 /* Generate a subtract instruction at the beginning of the prolog to
737 adjust the loop count by STAGE_COUNT. */
738 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
739
740 for (i = 0; i < last_stage; i++)
741 duplicate_insns_of_cycles (ps, 0, i, 1);
742
743 /* Put the prolog on the entry edge. */
744 e = loop_preheader_edge (loop);
745 split_edge_and_insert (e, get_insns ());
746
747 end_sequence ();
748
749 /* Generate the epilog, inserting its insns on the loop-exit edge. */
750 start_sequence ();
751
752 for (i = 0; i < last_stage; i++)
753 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
754
755 /* Put the epilogue on the exit edge. */
756 gcc_assert (single_exit (loop));
757 e = single_exit (loop);
758 split_edge_and_insert (e, get_insns ());
759 end_sequence ();
760 }
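/* To make the duplication scheme concrete (our example): with a stage
   count of 3, last_stage = 2, so the prolog above emits stages {0} and
   then {0,1}, and the epilog emits stages {1,2} and then {2}; together
   with the kernel these reproduce the original iteration count, which is
   why count_reg is decremented by last_stage before the kernel.  */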
761
762 /* Return true if all the BBs of the loop are empty except the
763 loop header. */
764 static bool
765 loop_single_full_bb_p (struct loop *loop)
766 {
767 unsigned i;
768 basic_block *bbs = get_loop_body (loop);
769
770 for (i = 0; i < loop->num_nodes ; i++)
771 {
772 rtx head, tail;
773 bool empty_bb = true;
774
775 if (bbs[i] == loop->header)
776 continue;
777
778 /* Make sure that basic blocks other than the header
779 have only notes, labels or jumps. */
780 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
781 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
782 {
783 if (NOTE_P (head) || LABEL_P (head)
784 || (INSN_P (head) && JUMP_P (head)))
785 continue;
786 empty_bb = false;
787 break;
788 }
789
790 if (! empty_bb)
791 {
792 free (bbs);
793 return false;
794 }
795 }
796 free (bbs);
797 return true;
798 }
799
800 /* A simple loop from SMS point of view; it is a loop that is composed of
801 either a single basic block or two BBs - a header and a latch. */
802 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
803 && (EDGE_COUNT (loop->latch->preds) == 1) \
804 && (EDGE_COUNT (loop->latch->succs) == 1))
805
806 /* Return true if the loop is in its canonical form and false if not.
807 i.e. it is SIMPLE_SMS_LOOP_P, has one preheader block, and a single exit. */
808 static bool
809 loop_canon_p (struct loop *loop)
810 {
811
812 if (loop->inner || !loop_outer (loop))
813 return false;
814
815 if (!single_exit (loop))
816 {
817 if (dump_file)
818 {
819 rtx insn = BB_END (loop->header);
820
821 fprintf (dump_file, "SMS loop many exits ");
822 fprintf (dump_file, " %s %d (file, line)\n",
823 insn_file (insn), insn_line (insn));
824 }
825 return false;
826 }
827
828 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
829 {
830 if (dump_file)
831 {
832 rtx insn = BB_END (loop->header);
833
834 fprintf (dump_file, "SMS loop many BBs. ");
835 fprintf (dump_file, " %s %d (file, line)\n",
836 insn_file (insn), insn_line (insn));
837 }
838 return false;
839 }
840
841 return true;
842 }
843
844 /* If the loop has more than one entry,
845 make it one by splitting the first entry edge and
846 redirecting the others to the new BB. */
847 static void
848 canon_loop (struct loop *loop)
849 {
850 edge e;
851 edge_iterator i;
852
853 /* Avoid annoying special cases of edges going to exit
854 block. */
855 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
856 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
857 split_edge (e);
858
859 if (loop->latch == loop->header
860 || EDGE_COUNT (loop->latch->succs) > 1)
861 {
862 FOR_EACH_EDGE (e, i, loop->header->preds)
863 if (e->src == loop->latch)
864 break;
865 split_edge (e);
866 }
867 }
868
869 /* Probability in % that the sms-ed loop rolls enough times so that the
870 optimized version may be entered. Just a guess. */
871 #define PROB_SMS_ENOUGH_ITERATIONS 80
872
873 /* Main entry point, perform SMS scheduling on the loops of the function
874 that consist of single basic blocks. */
875 static void
876 sms_schedule (void)
877 {
878 static int passes = 0;
879 rtx insn;
880 ddg_ptr *g_arr, g;
881 int * node_order;
882 int maxii;
883 loop_iterator li;
884 partial_schedule_ptr ps;
885 basic_block bb = NULL;
886 struct loop *loop;
887 basic_block condition_bb = NULL;
888 edge latch_edge;
889 gcov_type trip_count = 0;
890
891 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
892 | LOOPS_HAVE_RECORDED_EXITS);
893 if (number_of_loops () <= 1)
894 {
895 loop_optimizer_finalize ();
896 return; /* There are no loops to schedule. */
897 }
898
899 /* Initialize issue_rate. */
900 if (targetm.sched.issue_rate)
901 {
902 int temp = reload_completed;
903
904 reload_completed = 1;
905 issue_rate = targetm.sched.issue_rate ();
906 reload_completed = temp;
907 }
908 else
909 issue_rate = 1;
910
911 /* Initialize the scheduler. */
912 current_sched_info = &sms_sched_info;
913
914 /* Init Data Flow analysis, to be used in interloop dep calculation. */
915 df_set_flags (DF_LR_RUN_DCE);
916 df_rd_add_problem ();
917 df_ru_add_problem ();
918 df_note_add_problem ();
919 df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
920 df_analyze ();
921 regstat_compute_calls_crossed ();
922 sched_init ();
923
924 /* Allocate memory to hold the DDG array, one entry for each loop.
925 We use loop->num as an index into this array. */
926 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
927
928 /* Build DDGs for all the relevant loops and hold them in G_ARR
929 indexed by the loop index. */
930 FOR_EACH_LOOP (li, loop, 0)
931 {
932 rtx head, tail;
933 rtx count_reg;
934
935 /* For debugging. */
936 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
937 {
938 if (dump_file)
939 fprintf (dump_file, "SMS reached MAX_SMS_LOOP_NUMBER...\n");
940
941 break;
942 }
943
944 if (! loop_canon_p (loop))
945 continue;
946
947 if (! loop_single_full_bb_p (loop))
948 continue;
949
950 bb = loop->header;
951
952 get_ebb_head_tail (bb, bb, &head, &tail);
953 latch_edge = loop_latch_edge (loop);
954 gcc_assert (single_exit (loop));
955 if (single_exit (loop)->count)
956 trip_count = latch_edge->count / single_exit (loop)->count;
957
958 /* Perform SMS only on loops whose average count is above the threshold. */
959
960 if ( latch_edge->count
961 && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
962 {
963 if (dump_file)
964 {
965 fprintf (dump_file, " %s %d (file, line)\n",
966 insn_file (tail), insn_line (tail));
967 fprintf (dump_file, "SMS single-bb-loop\n");
968 if (profile_info && flag_branch_probabilities)
969 {
970 fprintf (dump_file, "SMS loop-count ");
971 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
972 (HOST_WIDEST_INT) bb->count);
973 fprintf (dump_file, "\n");
974 fprintf (dump_file, "SMS trip-count ");
975 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
976 (HOST_WIDEST_INT) trip_count);
977 fprintf (dump_file, "\n");
978 fprintf (dump_file, "SMS profile-sum-max ");
979 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
980 (HOST_WIDEST_INT) profile_info->sum_max);
981 fprintf (dump_file, "\n");
982 }
983 }
984 continue;
985 }
986
987 /* Make sure this is a doloop. */
988 if ( !(count_reg = doloop_register_get (tail)))
989 continue;
990
991 /* Don't handle BBs with calls or barriers, or !single_set insns. */
992 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
993 if (CALL_P (insn)
994 || BARRIER_P (insn)
995 || (INSN_P (insn) && !JUMP_P (insn)
996 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
997 break;
998
999 if (insn != NEXT_INSN (tail))
1000 {
1001 if (dump_file)
1002 {
1003 if (CALL_P (insn))
1004 fprintf (dump_file, "SMS loop-with-call\n");
1005 else if (BARRIER_P (insn))
1006 fprintf (dump_file, "SMS loop-with-barrier\n");
1007 else
1008 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1009 print_rtl_single (dump_file, insn);
1010 }
1011
1012 continue;
1013 }
1014
1015 if (! (g = create_ddg (bb, 0)))
1016 {
1017 if (dump_file)
1018 fprintf (dump_file, "SMS create_ddg failed\n");
1019 continue;
1020 }
1021
1022 g_arr[loop->num] = g;
1023 }
1024
1025 /* We don't want to perform SMS on new loops created by versioning. */
1026 FOR_EACH_LOOP (li, loop, 0)
1027 {
1028 rtx head, tail;
1029 rtx count_reg, count_init;
1030 int mii, rec_mii;
1031 unsigned stage_count = 0;
1032 HOST_WIDEST_INT loop_count = 0;
1033
1034 if (! (g = g_arr[loop->num]))
1035 continue;
1036
1037 if (dump_file)
1038 print_ddg (dump_file, g);
1039
1040 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1041
1042 latch_edge = loop_latch_edge (loop);
1043 gcc_assert (single_exit (loop));
1044 if (single_exit (loop)->count)
1045 trip_count = latch_edge->count / single_exit (loop)->count;
1046
1047 if (dump_file)
1048 {
1049 fprintf (dump_file, " %s %d (file, line)\n",
1050 insn_file (tail), insn_line (tail));
1051 fprintf (dump_file, "SMS single-bb-loop\n");
1052 if (profile_info && flag_branch_probabilities)
1053 {
1054 fprintf (dump_file, "SMS loop-count ");
1055 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1056 (HOST_WIDEST_INT) bb->count);
1057 fprintf (dump_file, "\n");
1058 fprintf (dump_file, "SMS profile-sum-max ");
1059 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1060 (HOST_WIDEST_INT) profile_info->sum_max);
1061 fprintf (dump_file, "\n");
1062 }
1063 fprintf (dump_file, "SMS doloop\n");
1064 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1065 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1066 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1067 }
1068
1069
1070 /* If the loop has a doloop register, it gets special
1071 handling. */
1072 count_init = NULL_RTX;
1073 if ((count_reg = doloop_register_get (tail)))
1074 {
1075 basic_block pre_header;
1076
1077 pre_header = loop_preheader_edge (loop)->src;
1078 count_init = const_iteration_count (count_reg, pre_header,
1079 &loop_count);
1080 }
1081 gcc_assert (count_reg);
1082
1083 if (dump_file && count_init)
1084 {
1085 fprintf (dump_file, "SMS const-doloop ");
1086 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1087 loop_count);
1088 fprintf (dump_file, "\n");
1089 }
1090
1091 node_order = XNEWVEC (int, g->num_nodes);
1092
1093 mii = 1; /* Need to pass some estimate of mii. */
1094 rec_mii = sms_order_nodes (g, mii, node_order);
1095 mii = MAX (res_MII (g), rec_mii);
1096 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1097
1098 if (dump_file)
1099 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1100 rec_mii, mii, maxii);
1101
1102 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1103 ASAP. */
1104 set_node_sched_params (g);
1105
1106 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1107
1108 if (ps)
1109 stage_count = PS_STAGE_COUNT (ps);
1110
1111 /* A stage count of 1 means that there is no interleaving between
1112 iterations, so let the normal scheduling passes do the job. */
1113 if (stage_count < 1
1114 || (count_init && (loop_count <= stage_count))
1115 || (flag_branch_probabilities && (trip_count <= stage_count)))
1116 {
1117 if (dump_file)
1118 {
1119 fprintf (dump_file, "SMS failed... \n");
1120 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1121 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1122 fprintf (dump_file, ", trip-count=");
1123 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1124 fprintf (dump_file, ")\n");
1125 }
1126 continue;
1127 }
1128 else
1129 {
1130 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1131 int new_cycles;
1132 struct undo_replace_buff_elem *reg_move_replaces;
1133
1134 if (dump_file)
1135 {
1136 fprintf (dump_file,
1137 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1138 stage_count);
1139 print_partial_schedule (ps, dump_file);
1140 fprintf (dump_file,
1141 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1142 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1143 }
1144
1145 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1146 the closing_branch was scheduled and should appear in the last (ii-1)
1147 row. Otherwise, we are free to schedule the branch, and we let nodes
1148 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1149 row; this should reduce stage_count to minimum. */
1150 normalize_sched_times (ps);
1151 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1152 set_columns_for_ps (ps);
1153
1154 /* Generate the kernel just to be able to measure its cycles. */
1155 permute_partial_schedule (ps, g->closing_branch->first_note);
1156 reg_move_replaces = generate_reg_moves (ps, false);
1157
1158 /* Get the number of cycles the new kernel expects to execute in. */
1159 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1160
1161 /* Get back to the original loop so we can do loop versioning. */
1162 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1163 if (reg_move_replaces)
1164 undo_generate_reg_moves (ps, reg_move_replaces);
1165
1166 if (new_cycles >= orig_cycles)
1167 {
1168 /* SMS is not profitable so undo the permutation and reg move generation
1169 and return the kernel to its original state. */
1170 if (dump_file)
1171 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1172
1173 }
1174 else
1175 {
1176 canon_loop (loop);
1177
1178 /* In case the BCT count is not known, do loop-versioning. */
1179 if (count_reg && ! count_init)
1180 {
1181 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1182 GEN_INT (stage_count));
1183 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1184 * REG_BR_PROB_BASE) / 100;
1185
1186 loop_version (loop, comp_rtx, &condition_bb,
1187 prob, prob, REG_BR_PROB_BASE - prob,
1188 true);
1189 }
1190
1191 /* Set new iteration count of loop kernel. */
1192 if (count_reg && count_init)
1193 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1194 - stage_count + 1);
1195
1196 /* Now apply the scheduled kernel to the RTL of the loop. */
1197 permute_partial_schedule (ps, g->closing_branch->first_note);
1198
1199 /* Mark this loop as software pipelined so the later
1200 scheduling passes don't touch it. */
1201 if (! flag_resched_modulo_sched)
1202 g->bb->flags |= BB_DISABLE_SCHEDULE;
1203 /* The life-info is not valid any more. */
1204 df_set_bb_dirty (g->bb);
1205
1206 reg_move_replaces = generate_reg_moves (ps, true);
1207 if (dump_file)
1208 print_node_sched_params (dump_file, g->num_nodes);
1209 /* Generate prolog and epilog. */
1210 if (count_reg && !count_init)
1211 generate_prolog_epilog (ps, loop, count_reg);
1212 else
1213 generate_prolog_epilog (ps, loop, NULL_RTX);
1214 }
1215 free_undo_replace_buff (reg_move_replaces);
1216 }
1217
1218 free_partial_schedule (ps);
1219 free (node_sched_params);
1220 free (node_order);
1221 free_ddg (g);
1222 }
1223
1224 regstat_free_calls_crossed ();
1225 free (g_arr);
1226
1227 /* Release scheduler data, needed until now because of DFA. */
1228 sched_finish ();
1229 loop_optimizer_finalize ();
1230 }
1231
1232 /* The SMS scheduling algorithm itself
1233 -----------------------------------
1234 Input: 'O' an ordered list of insns of a loop.
1235 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1236
1237 'Q' is the empty Set
1238 'PS' is the partial schedule; it holds the currently scheduled nodes with
1239 their cycle/slot.
1240 'PSP' previously scheduled predecessors.
1241 'PSS' previously scheduled successors.
1242 't(u)' the cycle where u is scheduled.
1243 'l(u)' is the latency of u.
1244 'd(v,u)' is the dependence distance from v to u.
1245 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1246 the node ordering phase.
1247 'check_hardware_resources_conflicts(u, PS, c)'
1248 run a trace around cycle/slot through DFA model
1249 to check resource conflicts involving instruction u
1250 at cycle c given the partial schedule PS.
1251 'add_to_partial_schedule_at_time(u, PS, c)'
1252 Add the node/instruction u to the partial schedule
1253 PS at time c.
1254 'calculate_register_pressure(PS)'
1255 Given a schedule of instructions, calculate the register
1256 pressure it implies. One implementation could be the
1257 maximum number of overlapping live ranges.
1258 'maxRP' The maximum allowed register pressure, usually derived from the
1259 number of registers available in the hardware.
1260
1261 1. II = MII.
1262 2. PS = empty list
1263 3. for each node u in O in pre-computed order
1264 4. if (PSP(u) != Q && PSS(u) == Q) then
1265 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1266 6. start = Early_start; end = Early_start + II - 1; step = 1
1267 11. else if (PSP(u) == Q && PSS(u) != Q) then
1268 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1269 13. start = Late_start; end = Late_start - II + 1; step = -1
1270 14. else if (PSP(u) != Q && PSS(u) != Q) then
1271 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1272 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1273 17. start = Early_start;
1274 18. end = min(Early_start + II - 1 , Late_start);
1275 19. step = 1
1276 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1277 21. start = ASAP(u); end = start + II - 1; step = 1
1278 22. endif
1279
1280 23. success = false
1281 24. for (c = start ; c != end ; c += step)
1282 25. if check_hardware_resources_conflicts(u, PS, c) then
1283 26. add_to_partial_schedule_at_time(u, PS, c)
1284 27. success = true
1285 28. break
1286 29. endif
1287 30. endfor
1288 31. if (success == false) then
1289 32. II = II + 1
1290 33. if (II > maxII) then
1291 34. finish - failed to schedule
1292 35. endif
1293 36. goto 2.
1294 37. endif
1295 38. endfor
1296 39. if (calculate_register_pressure(PS) > maxRP) then
1297 40. goto 32.
1298 41. endif
1299 42. compute epilogue & prologue
1300 43. finish - succeeded to schedule
1301 */
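/* A short worked trace of the window computation above (our numbers):
   with II = 4, if u has a single scheduled predecessor v with t(v) = 6,
   l(v) = 2 and d(v,u) = 1, then Early_start(u) = 6 + 2 - 1*4 = 4 and the
   cycles tried are 4, 5, 6, 7 (start = 4, end = start + II - 1, step = 1);
   at most II cycles are probed before II is bumped.  */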
1302
1303 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1304 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1305 set to 0 to save compile time. */
1306 #define DFA_HISTORY SMS_DFA_HISTORY
1307
1308 /* Given the partial schedule PS, this function calculates and returns the
1309 cycles in which we can schedule the node with the given index I.
1310 NOTE: Here, in some special cases, we do the backtracking of SMS. We have
1311 noticed that there are several cases in which we fail to SMS the loop
1312 because the sched window of a node is empty due to tight data-deps. In
1313 such cases we want to unschedule some of the predecessors/successors
1314 until we get a non-empty scheduling window. It returns -1 if the
1315 scheduling window is empty and zero otherwise. */
1316
1317 static int
1318 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1319 sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1320 {
1321 int start, step, end;
1322 ddg_edge_ptr e;
1323 int u = nodes_order [i];
1324 ddg_node_ptr u_node = &ps->g->nodes[u];
1325 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1326 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1327 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1328 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1329 int psp_not_empty;
1330 int pss_not_empty;
1331
1332 /* 1. compute sched window for u (start, end, step). */
1333 sbitmap_zero (psp);
1334 sbitmap_zero (pss);
1335 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1336 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1337
1338 if (psp_not_empty && !pss_not_empty)
1339 {
1340 int early_start = INT_MIN;
1341
1342 end = INT_MAX;
1343 for (e = u_node->in; e != 0; e = e->next_in)
1344 {
1345 ddg_node_ptr v_node = e->src;
1346 if (TEST_BIT (sched_nodes, v_node->cuid))
1347 {
1348 int node_st = SCHED_TIME (v_node)
1349 + e->latency - (e->distance * ii);
1350
1351 early_start = MAX (early_start, node_st);
1352
1353 if (e->data_type == MEM_DEP)
1354 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1355 }
1356 }
1357 start = early_start;
1358 end = MIN (end, early_start + ii);
1359 step = 1;
1360 }
1361
1362 else if (!psp_not_empty && pss_not_empty)
1363 {
1364 int late_start = INT_MAX;
1365
1366 end = INT_MIN;
1367 for (e = u_node->out; e != 0; e = e->next_out)
1368 {
1369 ddg_node_ptr v_node = e->dest;
1370 if (TEST_BIT (sched_nodes, v_node->cuid))
1371 {
1372 late_start = MIN (late_start,
1373 SCHED_TIME (v_node) - e->latency
1374 + (e->distance * ii));
1375 if (e->data_type == MEM_DEP)
1376 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1377 }
1378 }
1379 start = late_start;
1380 end = MAX (end, late_start - ii);
1381 step = -1;
1382 }
1383
1384 else if (psp_not_empty && pss_not_empty)
1385 {
1386 int early_start = INT_MIN;
1387 int late_start = INT_MAX;
1388
1389 start = INT_MIN;
1390 end = INT_MAX;
1391 for (e = u_node->in; e != 0; e = e->next_in)
1392 {
1393 ddg_node_ptr v_node = e->src;
1394
1395 if (TEST_BIT (sched_nodes, v_node->cuid))
1396 {
1397 early_start = MAX (early_start,
1398 SCHED_TIME (v_node) + e->latency
1399 - (e->distance * ii));
1400 if (e->data_type == MEM_DEP)
1401 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1402 }
1403 }
1404 for (e = u_node->out; e != 0; e = e->next_out)
1405 {
1406 ddg_node_ptr v_node = e->dest;
1407
1408 if (TEST_BIT (sched_nodes, v_node->cuid))
1409 {
1410 late_start = MIN (late_start,
1411 SCHED_TIME (v_node) - e->latency
1412 + (e->distance * ii));
1413 if (e->data_type == MEM_DEP)
1414 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1415 }
1416 }
1417 start = MAX (start, early_start);
1418 end = MIN (end, MIN (early_start + ii, late_start + 1));
1419 step = 1;
1420 }
1421 else /* psp is empty && pss is empty. */
1422 {
1423 start = SCHED_ASAP (u_node);
1424 end = start + ii;
1425 step = 1;
1426 }
1427
1428 *start_p = start;
1429 *step_p = step;
1430 *end_p = end;
1431 sbitmap_free (psp);
1432 sbitmap_free (pss);
1433
1434 if ((start >= end && step == 1) || (start <= end && step == -1))
1435 return -1;
1436 else
1437 return 0;
1438 }
1439
1440 /* Schedule the nodes of G according to the SMS algorithm described
1441 above, trying values of II from MII up to MAXII. */
1442 static partial_schedule_ptr
1443 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
1444 {
1445 int ii = mii;
1446 int i, c, success;
1447 int try_again_with_larger_ii = true;
1448 int num_nodes = g->num_nodes;
1449 ddg_edge_ptr e;
1450 int start, end, step; /* Place together into one struct? */
1451 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1452 sbitmap must_precede = sbitmap_alloc (num_nodes);
1453 sbitmap must_follow = sbitmap_alloc (num_nodes);
1454 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1455
1456 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1457
1458 sbitmap_ones (tobe_scheduled);
1459 sbitmap_zero (sched_nodes);
1460
1461 while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1462 || try_again_with_larger_ii ) && ii < maxii)
1463 {
1464 int j;
1465 bool unscheduled_nodes = false;
1466
1467 if (dump_file)
1468 fprintf (dump_file, "Starting with ii=%d\n", ii);
1469 if (try_again_with_larger_ii)
1470 {
1471 try_again_with_larger_ii = false;
1472 sbitmap_zero (sched_nodes);
1473 }
1474
1475 for (i = 0; i < num_nodes; i++)
1476 {
1477 int u = nodes_order[i];
1478 ddg_node_ptr u_node = &ps->g->nodes[u];
1479 rtx insn = u_node->insn;
1480
1481 if (!INSN_P (insn))
1482 {
1483 RESET_BIT (tobe_scheduled, u);
1484 continue;
1485 }
1486
1487 if (JUMP_P (insn)) /* Closing branch handled later. */
1488 {
1489 RESET_BIT (tobe_scheduled, u);
1490 continue;
1491 }
1492
1493 if (TEST_BIT (sched_nodes, u))
1494 continue;
1495
1496 /* Try to get a non-empty scheduling window. */
1497 j = i;
1498 while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1499 && j > 0)
1500 {
1501 unscheduled_nodes = true;
1502 if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1503 || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1504 {
1505 ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1506 RESET_BIT (sched_nodes, nodes_order [j - 1]);
1507 }
1508 j--;
1509 }
1510 if (j < 0)
1511 {
1512 /* ??? Try backtracking instead of immediately ii++? */
1513 ii++;
1514 try_again_with_larger_ii = true;
1515 reset_partial_schedule (ps, ii);
1516 break;
1517 }
1518 /* 2. Try scheduling u in window. */
1519 if (dump_file)
1520 fprintf (dump_file,
1521 "Trying to schedule node %d in (%d .. %d) step %d\n",
1522 u, start, end, step);
1523
1524 /* Use the must_follow & must_precede bitmaps to determine the order
1525 of nodes within the cycle. */
1526 sbitmap_zero (must_precede);
1527 sbitmap_zero (must_follow);
1528 for (e = u_node->in; e != 0; e = e->next_in)
1529 if (TEST_BIT (sched_nodes, e->src->cuid)
1530 && e->latency == (ii * e->distance)
1531 && start == SCHED_TIME (e->src))
1532 SET_BIT (must_precede, e->src->cuid);
1533
1534 for (e = u_node->out; e != 0; e = e->next_out)
1535 if (TEST_BIT (sched_nodes, e->dest->cuid)
1536 && e->latency == (ii * e->distance)
1537 && end == SCHED_TIME (e->dest))
1538 SET_BIT (must_follow, e->dest->cuid);
1539
1540 success = 0;
1541 if ((step > 0 && start < end) || (step < 0 && start > end))
1542 for (c = start; c != end; c += step)
1543 {
1544 ps_insn_ptr psi;
1545
1546 psi = ps_add_node_check_conflicts (ps, u_node, c,
1547 must_precede,
1548 must_follow);
1549
1550 if (psi)
1551 {
1552 SCHED_TIME (u_node) = c;
1553 SET_BIT (sched_nodes, u);
1554 success = 1;
1555 if (dump_file)
1556 fprintf (dump_file, "Schedule in %d\n", c);
1557 break;
1558 }
1559 }
1560 if (!success)
1561 {
1562 /* ??? Try backtracking instead of immediately ii++? */
1563 ii++;
1564 try_again_with_larger_ii = true;
1565 reset_partial_schedule (ps, ii);
1566 break;
1567 }
1568 if (unscheduled_nodes)
1569 break;
1570
1571 /* ??? If (success), check register pressure estimates. */
1572 } /* Continue with next node. */
1573 } /* While try_again_with_larger_ii. */
1574
1575 sbitmap_free (sched_nodes);
1576 sbitmap_free (must_precede);
1577 sbitmap_free (must_follow);
1578 sbitmap_free (tobe_scheduled);
1579
1580 if (ii >= maxii)
1581 {
1582 free_partial_schedule (ps);
1583 ps = NULL;
1584 }
1585 return ps;
1586 }
1587
1588 \f
1589 /* This page implements the algorithm for ordering the nodes of a DDG
1590 for modulo scheduling, activated through the
1591 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1592
1593 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1594 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1595 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1596 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1597 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1598 #define DEPTH(x) (ASAP ((x)))
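/* For example (our numbers): a node with ASAP = 2 and ALAP = 5 has
   MOB = 3, i.e. it may slide across three extra cycles without
   stretching the critical path; nodes with MOB = 0 are the most
   constrained and are preferred by the tie-breaking rules below.  */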
1599
1600 typedef struct node_order_params * nopa;
1601
1602 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1603 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1604 static nopa calculate_order_params (ddg_ptr, int mii);
1605 static int find_max_asap (ddg_ptr, sbitmap);
1606 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1607 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1608
1609 enum sms_direction {BOTTOMUP, TOPDOWN};
1610
1611 struct node_order_params
1612 {
1613 int asap;
1614 int alap;
1615 int height;
1616 };
1617
1618 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1619 static void
1620 check_nodes_order (int *node_order, int num_nodes)
1621 {
1622 int i;
1623 sbitmap tmp = sbitmap_alloc (num_nodes);
1624
1625 sbitmap_zero (tmp);
1626
1627 for (i = 0; i < num_nodes; i++)
1628 {
1629 int u = node_order[i];
1630
1631 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1632
1633 SET_BIT (tmp, u);
1634 }
1635
1636 sbitmap_free (tmp);
1637 }
1638
1639 /* Order the nodes of G for scheduling and pass the result in
1640 NODE_ORDER. Also set aux.count of each node to ASAP.
1641 Return the recMII for the given DDG. */
1642 static int
1643 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1644 {
1645 int i;
1646 int rec_mii = 0;
1647 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1648
1649 nopa nops = calculate_order_params (g, mii);
1650
1651 if (dump_file)
1652 print_sccs (dump_file, sccs, g);
1653
1654 order_nodes_of_sccs (sccs, node_order);
1655
1656 if (sccs->num_sccs > 0)
1657 /* First SCC has the largest recurrence_length. */
1658 rec_mii = sccs->sccs[0]->recurrence_length;
1659
1660 /* Save ASAP before destroying node_order_params. */
1661 for (i = 0; i < g->num_nodes; i++)
1662 {
1663 ddg_node_ptr v = &g->nodes[i];
1664 v->aux.count = ASAP (v);
1665 }
1666
1667 free (nops);
1668 free_ddg_all_sccs (sccs);
1669 check_nodes_order (node_order, g->num_nodes);
1670
1671 return rec_mii;
1672 }
1673
1674 static void
1675 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1676 {
1677 int i, pos = 0;
1678 ddg_ptr g = all_sccs->ddg;
1679 int num_nodes = g->num_nodes;
1680 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1681 sbitmap on_path = sbitmap_alloc (num_nodes);
1682 sbitmap tmp = sbitmap_alloc (num_nodes);
1683 sbitmap ones = sbitmap_alloc (num_nodes);
1684
1685 sbitmap_zero (prev_sccs);
1686 sbitmap_ones (ones);
1687
1688 /* Perform the node ordering starting from the SCC with the highest recMII.
1689 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1690 for (i = 0; i < all_sccs->num_sccs; i++)
1691 {
1692 ddg_scc_ptr scc = all_sccs->sccs[i];
1693
1694 /* Add nodes on paths from previous SCCs to the current SCC. */
1695 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1696 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1697
1698 /* Add nodes on paths from the current SCC to previous SCCs. */
1699 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1700 sbitmap_a_or_b (tmp, tmp, on_path);
1701
1702 /* Remove nodes of previous SCCs from current extended SCC. */
1703 sbitmap_difference (tmp, tmp, prev_sccs);
1704
1705 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1706 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1707 }
1708
1709 /* Handle the remaining nodes that do not belong to any scc. Each call
1710 to order_nodes_in_scc handles a single connected component. */
1711 while (pos < g->num_nodes)
1712 {
1713 sbitmap_difference (tmp, ones, prev_sccs);
1714 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1715 }
1716 sbitmap_free (prev_sccs);
1717 sbitmap_free (on_path);
1718 sbitmap_free (tmp);
1719 sbitmap_free (ones);
1720 }
1721
1722 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1723 static struct node_order_params *
1724 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1725 {
1726 int u;
1727 int max_asap;
1728 int num_nodes = g->num_nodes;
1729 ddg_edge_ptr e;
1730 /* Allocate a place to hold ordering params for each node in the DDG. */
1731 nopa node_order_params_arr;
1732
1733 /* Initialize ASAP/ALAP/HEIGHT to zero. */
1734 node_order_params_arr = (nopa) xcalloc (num_nodes,
1735 sizeof (struct node_order_params));
1736
1737 /* Set the aux pointer of each node to point to its order_params structure. */
1738 for (u = 0; u < num_nodes; u++)
1739 g->nodes[u].aux.info = &node_order_params_arr[u];
1740
1741 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1742 calculate ASAP, ALAP, mobility, distance, and height for each node
1743 in the dependence (directed acyclic) graph. */
1744
1745 /* We assume that the nodes in the array are in topological order. */
1746
1747 max_asap = 0;
1748 for (u = 0; u < num_nodes; u++)
1749 {
1750 ddg_node_ptr u_node = &g->nodes[u];
1751
1752 ASAP (u_node) = 0;
1753 for (e = u_node->in; e; e = e->next_in)
1754 if (e->distance == 0)
1755 ASAP (u_node) = MAX (ASAP (u_node),
1756 ASAP (e->src) + e->latency);
1757 max_asap = MAX (max_asap, ASAP (u_node));
1758 }
1759
1760 for (u = num_nodes - 1; u > -1; u--)
1761 {
1762 ddg_node_ptr u_node = &g->nodes[u];
1763
1764 ALAP (u_node) = max_asap;
1765 HEIGHT (u_node) = 0;
1766 for (e = u_node->out; e; e = e->next_out)
1767 if (e->distance == 0)
1768 {
1769 ALAP (u_node) = MIN (ALAP (u_node),
1770 ALAP (e->dest) - e->latency);
1771 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1772 HEIGHT (e->dest) + e->latency);
1773 }
1774 }
1775
1776 return node_order_params_arr;
1777 }

static int
find_max_asap (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_asap = -1;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_asap < ASAP (u_node))
        {
          max_asap = ASAP (u_node);
          result = u;
        }
    }
  return result;
}

static int
find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_hv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_hv < HEIGHT (u_node))
        {
          max_hv = HEIGHT (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_hv == HEIGHT (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}

static int
find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_dv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_dv < DEPTH (u_node))
        {
          max_dv = DEPTH (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_dv == DEPTH (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}
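
/* Tie-breaking illustration: if the workset holds {u, v} with
   HEIGHT (u) == HEIGHT (v) but MOB (u) < MOB (v), find_max_hv_min_mob
   picks u; among equally "tall" nodes the less mobile one is ordered
   first, since it has fewer feasible cycles to be placed in.
   find_max_dv_min_mob breaks DEPTH ties the same way.  */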

/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
                    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
        SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_hv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_succs = NODE_SUCCESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_succs, scc);

              /* Don't consider the already ordered successors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = BOTTOMUP;
          sbitmap_zero (predecessors);
          find_predecessors (predecessors, g, nodes_ordered);
          sbitmap_a_and_b (workset, predecessors, scc);
        }
      else
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_dv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_preds = NODE_PREDECESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_preds, scc);

              /* Don't consider the already ordered predecessors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = TOPDOWN;
          sbitmap_zero (successors);
          find_successors (successors, g, nodes_ordered);
          sbitmap_a_and_b (workset, successors, scc);
        }
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
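
/* Illustration of the alternating sweep above: seeded BOTTOMUP from the
   max-ASAP node (or from the SCC's intersection with the predecessors
   or successors of already-ordered nodes), the inner loop drains the
   workset, emitting one node per iteration into NODE_ORDER and pulling
   in its not-yet-ordered successors (TOPDOWN) or predecessors
   (BOTTOMUP) within the SCC; when the workset empties, the direction
   flips and the workset is reseeded from everything ordered so far.  */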

\f
/* This page contains functions for manipulating partial-schedules during
   modulo scheduling.  */

/* Create a partial schedule and allocate memory to hold II rows.  */

static partial_schedule_ptr
create_partial_schedule (int ii, ddg_ptr g, int history)
{
  partial_schedule_ptr ps = XNEW (struct partial_schedule);
  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
  ps->ii = ii;
  ps->history = history;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
  ps->g = g;

  return ps;
}

/* Free the PS_INSNs in the rows array of the given partial schedule.
   ??? Consider caching the PS_INSNs.  */
static void
free_ps_insns (partial_schedule_ptr ps)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      while (ps->rows[i])
        {
          ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;

          free (ps->rows[i]);
          ps->rows[i] = ps_insn;
        }
      ps->rows[i] = NULL;
    }
}

/* Free all the memory allocated to the partial schedule.  */

static void
free_partial_schedule (partial_schedule_ptr ps)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  free (ps->rows);
  free (ps);
}

/* Clear the rows array with its PS_INSNs, and create a new one with
   NEW_II rows.  */

static void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  if (new_ii == ps->ii)
    return;
  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
                                                 * sizeof (ps_insn_ptr));
  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
  ps->ii = new_ii;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
}
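
/* Typical lifecycle of a partial schedule (a hedged usage sketch, not
   the actual driver; try_schedule () stands in for the list-scheduling
   attempt made at a given II):

     partial_schedule_ptr ps = create_partial_schedule (mii, g, history);
     while (ii <= maxii && ! try_schedule (ps))
       reset_partial_schedule (ps, ++ii);    -- retry with a larger II
     ...
     free_partial_schedule (ps);

   Note that reset_partial_schedule with an unchanged II keeps the rows
   array but still frees the PS_INSNs chained in it.  */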

/* Prints the partial schedule as an II-row array: for each row it
   prints the IDs of the insns scheduled in that row.  */
void
print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      ps_insn_ptr ps_i = ps->rows[i];

      fprintf (dump, "\n[CYCLE %d ]: ", i);
      while (ps_i)
        {
          fprintf (dump, "%d, ",
                   INSN_UID (ps_i->node->insn));
          ps_i = ps_i->next_in_row;
        }
    }
}
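
/* For example, for a kernel scheduled with II == 2 the dump produced
   above looks like this (insn UIDs are of course arbitrary):

     [CYCLE 0 ]: 23, 31,
     [CYCLE 1 ]: 27,
 */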

/* Creates an object of PS_INSN and initializes it to the given parameters.  */
static ps_insn_ptr
create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
{
  ps_insn_ptr ps_i = XNEW (struct ps_insn);

  ps_i->node = node;
  ps_i->next_in_row = NULL;
  ps_i->prev_in_row = NULL;
  ps_i->row_rest_count = rest_count;
  ps_i->cycle = cycle;

  return ps_i;
}


/* Removes the given PS_INSN from the partial schedule.  Returns false if the
   node is not found in the partial schedule, else returns true.  */
static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
  int row;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);
  if (! ps_i->prev_in_row)
    {
      if (ps_i != ps->rows[row])
        return false;

      ps->rows[row] = ps_i->next_in_row;
      if (ps->rows[row])
        ps->rows[row]->prev_in_row = NULL;
    }
  else
    {
      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
    }
  free (ps_i);
  return true;
}

/* Unlike what the literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede, and insert
     the node immediately after the last must-precede, making sure that
     no must-follow comes before that point.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
          && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
             there is no possible column.  */
          if (first_must_follow)
            return false;
          else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node after LAST_MUST_PRECEDE, or at the head of the
     row if there is none.  */

  if (! last_must_precede)
    {
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
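
/* A worked example (illustration only): suppose row 0 currently holds
   columns A, B, C, and the node being placed has MUST_PRECEDE = {A}
   and MUST_FOLLOW = {C}.  The scan above records A as last_must_precede
   and C as first_must_follow, so the new node is linked in right after
   A, yielding A, new, B, C.  If instead MUST_FOLLOW = {A} and
   MUST_PRECEDE = {B}, the must-follow A is met before the must-precede
   B and placement in this row fails.  */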

/* Advances the PS_INSN one column in its current row; returns false
   on failure and true on success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                        sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}

/* Inserts a DDG_NODE to the given partial schedule at the given cycle.
   Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
   before/after (respectively) the node pointed to by PS_I when scheduled
   in the same cycle.  */
static ps_insn_ptr
add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
                sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr ps_i;
  int rest_count = 1;
  int row = SMODULO (cycle, ps->ii);

  if (ps->rows[row]
      && ps->rows[row]->row_rest_count >= issue_rate)
    return NULL;

  if (ps->rows[row])
    rest_count += ps->rows[row]->row_rest_count;

  ps_i = create_ps_insn (node, rest_count, cycle);

  /* Finds and inserts PS_I according to MUST_FOLLOW and
     MUST_PRECEDE.  */
  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
    {
      free (ps_i);
      return NULL;
    }

  return ps_i;
}
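
/* Illustration of the row_rest_count bookkeeping above: with
   issue_rate == 4, the first insn placed in a row records rest_count 1,
   and each later insn records one more than the count found at the row
   head, so once the head records 4 a fifth insn aimed at the same row
   is rejected with NULL before any column search is attempted.  (The
   count is approximate when ps_insn_find_column places the new insn
   away from the head of the row.)  */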

/* Advance time one cycle.  Assumes DFA is being used.  */
static void
advance_one_cycle (void)
{
  if (targetm.sched.dfa_pre_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_pre_cycle_insn ());

  state_transition (curr_state, NULL);

  if (targetm.sched.dfa_post_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_post_cycle_insn ());
}

/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
   the number of cycles according to the DFA that the kernel fits in.
   We use this to check whether we have done well with SMS after adding
   register moves: in some cases the register-move overhead makes the
   kernel even worse than the original loop.  We want SMS to be performed
   only when it yields fewer cycles after the register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
        continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      /* Update the DFA state; if the insn cannot be issued in the
         current cycle because of resource conflicts, advance to a
         new cycle.  */
      if (state_transition (curr_state, insn) >= 0)
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      if (targetm.sched.variable_issue)
        can_issue_more =
          targetm.sched.variable_issue (sched_dump, sched_verbose,
                                        insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
         let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
               && GET_CODE (PATTERN (insn)) != CLOBBER)
        can_issue_more--;
    }
  return cycles;
}

/* Checks if PS has resource conflicts according to the DFA, from cycle
   FROM to cycle TO; returns true if there are conflicts and false
   if there are none.  Assumes the DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
           crr_insn;
           crr_insn = crr_insn->next_in_row)
        {
          rtx insn = crr_insn->node->insn;

          if (!INSN_P (insn))
            continue;

          /* Check if there is room for the current insn.  */
          if (!can_issue_more || state_dead_lock_p (curr_state))
            return true;

          /* Update the DFA state and return with failure if the DFA
             found resource conflicts.  */
          if (state_transition (curr_state, insn) >= 0)
            return true;

          if (targetm.sched.variable_issue)
            can_issue_more =
              targetm.sched.variable_issue (sched_dump, sched_verbose,
                                            insn, can_issue_more);
          /* A naked CLOBBER or USE generates no instruction, so don't
             let them consume issue slots.  */
          else if (GET_CODE (PATTERN (insn)) != USE
                   && GET_CODE (PATTERN (insn)) != CLOBBER)
            can_issue_more--;
        }

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}

/* Checks if the given node causes resource conflicts when added to PS at
   cycle C.  If not, the node is added to PS and returned; otherwise NULL
   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
   cuid N must come before/after (respectively) the node pointed to by
   PS_I when scheduled in the same cycle.  */
ps_insn_ptr
ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
                             int c, sbitmap must_precede,
                             sbitmap must_follow)
{
  int has_conflicts = 0;
  ps_insn_ptr ps_i;

  /* First add the node to the PS; if this succeeds, check for
     conflicts, trying different issue slots in the same row.  */
  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
    return NULL;  /* Failed to insert the node at the given cycle.  */

  has_conflicts = ps_has_conflicts (ps, c, c)
                  || (ps->history > 0
                      && ps_has_conflicts (ps,
                                           c - ps->history,
                                           c + ps->history));

  /* Try different issue slots to find one that the given node can be
     scheduled in without conflicts.  */
  while (has_conflicts)
    {
      if (! ps_insn_advance_column (ps, ps_i, must_follow))
        break;
      has_conflicts = ps_has_conflicts (ps, c, c)
                      || (ps->history > 0
                          && ps_has_conflicts (ps,
                                               c - ps->history,
                                               c + ps->history));
    }

  if (has_conflicts)
    {
      remove_node_from_ps (ps, ps_i);
      return NULL;
    }

  ps->min_cycle = MIN (ps->min_cycle, c);
  ps->max_cycle = MAX (ps->max_cycle, c);
  return ps_i;
}
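
/* A hedged sketch of how a caller typically drives the routine above
   (the real scheduling loop elsewhere in this file scans a window of
   cycles computed from already-scheduled neighbors; start, end and
   step are placeholders here):

     for (c = start; c != end; c += step)
       if (ps_add_node_check_conflicts (ps, u_node, c,
                                        must_precede, must_follow))
         {
           SCHED_TIME (u_node) = c;
           break;
         }
 */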

/* Rotate the rows of PS such that insns scheduled at time
   START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
void
rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
{
  int i, row, backward_rotates;
  int last_row = ps->ii - 1;

  if (start_cycle == 0)
    return;

  backward_rotates = SMODULO (start_cycle, ps->ii);

  /* Revisit later and optimize this into a single loop.  */
  for (i = 0; i < backward_rotates; i++)
    {
      ps_insn_ptr first_row = ps->rows[0];

      for (row = 0; row < last_row; row++)
        ps->rows[row] = ps->rows[row+1];

      ps->rows[last_row] = first_row;
    }

  ps->max_cycle -= start_cycle;
  ps->min_cycle -= start_cycle;
}
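
/* Worked example (illustration only): with ps->ii == 4 and
   start_cycle == 6, SMODULO (6, 4) == 2 backward rotations turn the
   rows [r0 r1 r2 r3] into [r2 r3 r0 r1], so the insns of cycle 6,
   which live in row 2, now sit in row 0; min_cycle and max_cycle are
   both shifted down by 6.  */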

/* Remove the node N from the partial schedule PS.  Because we restart
   the DFA each time we want to check for resource conflicts, this is
   equivalent to unscheduling the node N.  */
static bool
ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
{
  ps_insn_ptr ps_i;
  int row = SMODULO (SCHED_TIME (n), ps->ii);

  if (row < 0 || row >= ps->ii)
    return false;

  for (ps_i = ps->rows[row];
       ps_i && ps_i->node != n;
       ps_i = ps_i->next_in_row);
  if (!ps_i)
    return false;

  return remove_node_from_ps (ps, ps_i);
}
#endif /* INSN_SCHEDULING */
\f
static bool
gate_handle_sms (void)
{
  return (optimize > 0 && flag_modulo_sched);
}


/* Run the instruction scheduler: perform SMS modulo scheduling.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (0);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();

  /* Finalize layout changes.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  free_dominance_info (CDI_DOMINATORS);
  cfg_layout_finalize ();
#endif /* INSN_SCHEDULING */
  return 0;
}

struct tree_opt_pass pass_sms =
{
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  TODO_dump_func,                       /* todo_flags_start */
  TODO_df_finish |
  TODO_dump_func |
  TODO_ggc_collect,                     /* todo_flags_finish */
  'm'                                   /* letter */
};