/* Swing Modulo Scheduling implementation.
   Copyright (C) 2004, 2005
   Free Software Foundation, Inc.
   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "toplev.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "regs.h"
#include "function.h"
#include "flags.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "except.h"
#include "recog.h"
#include "sched-int.h"
#include "target.h"
#include "cfglayout.h"
#include "cfgloop.h"
#include "cfghooks.h"
#include "expr.h"
#include "params.h"
#include "gcov-io.h"
#include "df.h"
#include "ddg.h"

#ifdef INSN_SCHEDULING

/* This file contains the implementation of the Swing Modulo Scheduler,
   described in the following references:
   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero, and J. Eckhardt.
       Lifetime-sensitive modulo scheduling in a production environment.
       IEEE Trans. on Comps., 50(3), March 2001
   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
       PACT '96, pages 80-87, October 1996 (Boston, Massachusetts, USA).

   The basic structure is:
   1. Build a data-dependence graph (DDG) for each loop.
   2. Use the DDG to order the insns of a loop (not necessarily in
      topological order), trying to place each insn after all its
      predecessors _or_ after all its successors.
   3. Compute MII: a lower bound on the number of cycles to schedule the loop.
   4. Use the ordering to perform list-scheduling of the loop:
      1. Set II = MII.  We will try to schedule the loop within II cycles.
      2. Try to schedule the insns one by one according to the ordering.
         For each insn compute an interval of cycles by considering already-
         scheduled preds and succs (and associated latencies); try to place
         the insn in the cycles of this window checking for potential
         resource conflicts (using the DFA interface).
         Note: this is different from the cycle-scheduling of schedule_insns;
         here the insns are not scheduled monotonically top-down (nor bottom-
         up).
      3. If we fail to schedule all the insns, bump II and try again, unless
         II reaches an upper bound MaxII, in which case we report failure.
         A sketch of this II search loop appears just below.
   5. If we succeeded in scheduling the loop within II cycles, we now
      generate the prolog and epilog, decrease the counter of the loop, and
      perform modulo variable expansion for live ranges that span more than
      II cycles (i.e. use register copies to prevent a def from overwriting
      itself before reaching the use).
*/
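
/* Illustrative sketch, not part of the pass: a minimal standalone rendering
   of the II search in step 4 above, assuming a hypothetical helper
   try_to_schedule_all_insns_within () standing in for steps 4.2-4.3.
   Kept under "#if 0" so it is never compiled.  */
#if 0
static int
sms_ii_search_sketch (int mii, int maxii)
{
  int ii;

  /* Start at the lower bound MII and bump II after each failed attempt,
     giving up once the upper bound MaxII is reached.  */
  for (ii = mii; ii < maxii; ii++)
    if (try_to_schedule_all_insns_within (ii))  /* Hypothetical helper.  */
      return ii;  /* Success: the kernel fits in II cycles.  */
  return -1;      /* Failure: report and leave the loop untouched.  */
}
#endif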

\f
/* This page defines partial-schedule structures and functions for
   modulo scheduling.  */

typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
                             + 1 + (ps)->ii - 1) / (ps)->ii)

/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  */
  ps_insn_ptr next_in_row,
              prev_in_row;

  /* The number of nodes in the same row that come after this node.  */
  int row_rest_count;
};

/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;       /* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;    /* The DDG of the insns in the partial schedule.  */
};
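
/* Illustrative sketch, not part of the pass: a standalone check of the
   macros above against the structure just defined.  With min_cycle = -3,
   max_cycle = 5 and ii = 4, the nodes span 5 - (-3) + 1 = 9 cycles, so
   PS_STAGE_COUNT rounds up to (9 + 4 - 1) / 4 = 3 stages, and SMODULO
   maps cycle -3 to row 1 (where C's % operator would give -3).  Kept
   under "#if 0" so it is never compiled.  */
#if 0
#include <assert.h>
int
main (void)
{
  struct partial_schedule p = { 0 };

  p.ii = 4;
  p.min_cycle = -3;
  p.max_cycle = 5;
  assert (PS_STAGE_COUNT (&p) == 3);
  assert (SMODULO (-3, 4) == 1);
  return 0;
}
#endif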

/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.  */
struct undo_replace_buff_elem
{
  rtx insn;
  rtx orig_reg;
  rtx new_reg;
  struct undo_replace_buff_elem *next;
};

partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
void free_partial_schedule (partial_schedule_ptr);
void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
                                                ddg_node_ptr node, int cycle,
                                                sbitmap must_precede,
                                                sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr);

\f
/* This page defines constants and structures for the modulo scheduling
   driver.  */

/* As in haifa-sched.c:  */
/* issue_rate is the number of insns that can be scheduled in the same
   machine cycle.  It can be defined in the config/mach/mach.h file,
   otherwise we set it to 1.  */

static int issue_rate;

/* For printing statistics.  */
static FILE *stats_file;

static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
                                                   int *, FILE*);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop,
                                    rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
                                       int from_stage, int to_stage,
                                       int for_prolog);

#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
        (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
        (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)

/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int asap;   /* A lower-bound on the absolute scheduling cycle.  */
  int time;   /* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;

\f
/* The following three functions are copied from the current scheduler
   code in order to use sched_analyze () for computing the dependencies.
   They are used when initializing the sched_info structure.  */
static const char *
sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
{
  static char tmp[80];

  sprintf (tmp, "i%4d", INSN_UID (insn));
  return tmp;
}

static int
contributes_to_priority (rtx next, rtx insn)
{
  return BLOCK_NUM (next) == BLOCK_NUM (insn);
}

static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
                               regset cond_exec ATTRIBUTE_UNUSED,
                               regset used ATTRIBUTE_UNUSED,
                               regset set ATTRIBUTE_UNUSED)
{
}

static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  contributes_to_priority,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0
};

/* Return the register decremented and tested in INSN,
   or zero if it is not a decrement-and-branch insn.  */

static rtx
doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
{
#ifdef HAVE_doloop_end
  rtx pattern, reg, condition;

  if (! JUMP_P (insn))
    return NULL_RTX;

  pattern = PATTERN (insn);
  condition = doloop_condition_get (pattern);
  if (! condition)
    return NULL_RTX;

  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
           && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  return reg;
#else
  return NULL_RTX;
#endif
}

/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
   that the number of iterations is a compile-time constant.  If so,
   return the rtx that sets COUNT_REG to a constant, and set COUNT to
   this constant.  Otherwise return 0.  */
static rtx
const_iteration_count (rtx count_reg, basic_block pre_header,
                       HOST_WIDEST_INT * count)
{
  rtx insn;
  rtx head, tail;

  if (! pre_header)
    return NULL_RTX;

  get_block_head_tail (pre_header->index, &head, &tail);

  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
    if (INSN_P (insn) && single_set (insn)
        && rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
      {
        rtx pat = single_set (insn);

        if (GET_CODE (SET_SRC (pat)) == CONST_INT)
          {
            *count = INTVAL (SET_SRC (pat));
            return insn;
          }

        return NULL_RTX;
      }

  return NULL_RTX;
}

/* A very simple resource-based lower bound on the initiation interval.
   ??? Improve the accuracy of this bound by considering the
   utilization of various units.  */
static int
res_MII (ddg_ptr g)
{
  return (g->num_nodes / issue_rate);
}
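
/* For instance (illustrative): with g->num_nodes = 7 and issue_rate = 2,
   res_MII yields 7 / 2 = 3 by integer division (a ceiling would give 4);
   either is a valid lower bound, the truncated value is simply weaker.  */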

/* Points to the array that contains the sched data for each node.  */
static node_sched_params_ptr node_sched_params;

/* Allocate sched_params for each node and initialize it.  Assumes that
   the aux field of each node contains the asap bound (computed earlier),
   and copies it into the sched_params field.  */
static void
set_node_sched_params (ddg_ptr g)
{
  int i;

  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
  /* Initialize ASAP/ALAP/HEIGHT to zero.  */
  node_sched_params = (node_sched_params_ptr)
                       xcalloc (g->num_nodes,
                                sizeof (struct node_sched_params));

  /* Set the pointer of the general data of the node to point to the
     appropriate sched_params structure.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      /* Watch out for aliasing problems?  */
      node_sched_params[i].asap = g->nodes[i].aux.count;
      g->nodes[i].aux.info = &node_sched_params[i];
    }
}

static void
print_node_sched_params (FILE * dump_file, int num_nodes)
{
  int i;

  if (! dump_file)
    return;
  for (i = 0; i < num_nodes; i++)
    {
      node_sched_params_ptr nsp = &node_sched_params[i];
      rtx reg_move = nsp->first_reg_move;
      int j;

      fprintf (dump_file, "Node %d:\n", i);
      fprintf (dump_file, " asap = %d:\n", nsp->asap);
      fprintf (dump_file, " time = %d:\n", nsp->time);
      fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
      for (j = 0; j < nsp->nreg_moves; j++)
        {
          fprintf (dump_file, " reg_move = ");
          print_rtl_single (dump_file, reg_move);
          reg_move = PREV_INSN (reg_move);
        }
    }
}

/* Calculate an upper bound for II.  SMS should not schedule the loop if it
   requires more cycles than this bound.  Currently set to the sum of the
   longest latency edge for each node.  Reset based on experiments.  */
static int
calculate_maxii (ddg_ptr g)
{
  int i;
  int maxii = 0;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int max_edge_latency = 0;

      for (e = u->out; e; e = e->next_out)
        max_edge_latency = MAX (max_edge_latency, e->latency);

      maxii += max_edge_latency;
    }
  return maxii;
}

/*
   Breaking intra-loop register anti-dependences:
   Each intra-loop register anti-dependence implies a cross-iteration true
   dependence of distance 1.  Therefore, we can remove such false dependencies
   and figure out if the partial schedule broke them by checking if (for a
   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
   if so generate a register move.  The number of such moves is equal to:

                SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
   nreg_moves = ----------------------------------- + 1 - {    dependence.
                             ii                           { 1 if not.
*/
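
/* Illustrative worked instance of the formula above, matching the
   nreg_moves4e computation below: with ii = 4, SCHED_TIME (def) = 1 and
   SCHED_TIME (use) = 10, a distance-0 true dependence needs
   (10 - 1) / 4 = 2 reg_moves, because the def is rewritten twice before
   this use reads it; the same dependence at distance 1 needs
   (10 - 1 + 4) / 4 = 3.  */
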
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps)
{
  ddg_ptr g = ps->g;
  int ii = ps->ii;
  int i;
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx last_reg_move;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
         ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int nreg_moves4e = (SCHED_TIME (e->dest)
                                - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              nreg_moves4e = (SCHED_TIME (e->dest)
                              - SCHED_TIME (e->src) + ii) / ii;

            /* If dest precedes src in the schedule of the kernel, then dest
               will read before src writes and we can save one reg_copy.  */
            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              nreg_moves4e--;

            nreg_moves = MAX (nreg_moves, nreg_moves4e);
          }

      if (nreg_moves == 0)
        continue;

      /* Every use of the register defined by node may require a different
         copy of this register, depending on the time the use is scheduled.
         Set a bitmap vector, telling which nodes use each copy of this
         register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int dest_copy = (SCHED_TIME (e->dest)
                             - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              dest_copy = (SCHED_TIME (e->dest)
                           - SCHED_TIME (e->src) + ii) / ii;

            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              dest_copy--;

            if (dest_copy)
              SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
          }

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      last_reg_move = u->insn;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
        {
          unsigned int i_use;
          rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
          rtx reg_move = gen_move_insn (new_reg, prev_reg);
          sbitmap_iterator sbi;

          add_insn_before (reg_move, last_reg_move);
          last_reg_move = reg_move;

          if (!SCHED_FIRST_REG_MOVE (u))
            SCHED_FIRST_REG_MOVE (u) = reg_move;

          EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
            {
              struct undo_replace_buff_elem *rep;

              rep = (struct undo_replace_buff_elem *)
                    xcalloc (1, sizeof (struct undo_replace_buff_elem));
              rep->insn = g->nodes[i_use].insn;
              rep->orig_reg = old_reg;
              rep->new_reg = new_reg;

              if (! reg_move_replaces)
                reg_move_replaces = rep;
              else
                {
                  rep->next = reg_move_replaces;
                  reg_move_replaces = rep;
                }

              replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
            }

          prev_reg = new_reg;
        }
      /* Don't leak the per-node bitmap vector.  */
      sbitmap_vector_free (uses_of_defs);
    }
  return reg_move_replaces;
}

/* We call this when we want to undo the SMS schedule for a given loop.
   One of the things that we do is to delete the register moves generated
   for the sake of SMS; this function deletes the register move instructions
   recorded in the undo buffer.  */
static void
undo_generate_reg_moves (partial_schedule_ptr ps,
                         struct undo_replace_buff_elem *reg_move_replaces)
{
  int i, j;

  for (i = 0; i < ps->g->num_nodes; i++)
    {
      ddg_node_ptr u = &ps->g->nodes[i];
      rtx prev;
      rtx crr = SCHED_FIRST_REG_MOVE (u);

      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
        {
          prev = PREV_INSN (crr);
          delete_insn (crr);
          crr = prev;
        }
      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
    }

  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
    }
}

/* Free memory allocated for the undo buffer.  */
static void
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
{
  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      free (rep);
    }
}

/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
   of SCHED_ROW and SCHED_STAGE.  */
static void
normalize_sched_times (partial_schedule_ptr ps)
{
  int i;
  ddg_ptr g = ps->g;
  int amount = PS_MIN_CYCLE (ps);
  int ii = ps->ii;

  /* Don't include the closing branch assuming that it is the last node.  */
  for (i = 0; i < g->num_nodes - 1; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      int normalized_time = SCHED_TIME (u) - amount;

      gcc_assert (normalized_time >= 0);

      SCHED_TIME (u) = normalized_time;
      SCHED_ROW (u) = normalized_time % ii;
      SCHED_STAGE (u) = normalized_time / ii;
    }
}

/* Set SCHED_COLUMN of each node according to its position in PS.  */
static void
set_columns_for_ps (partial_schedule_ptr ps)
{
  int row;

  for (row = 0; row < ps->ii; row++)
    {
      ps_insn_ptr cur_insn = ps->rows[row];
      int column = 0;

      for (; cur_insn; cur_insn = cur_insn->next_in_row)
        SCHED_COLUMN (cur_insn->node) = column++;
    }
}

/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
static void
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int ii = ps->ii;
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      if (PREV_INSN (last) != ps_ij->node->insn)
        reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
                            PREV_INSN (last));
}

/* As part of undoing SMS we return to the original ordering of the
   instructions inside the loop kernel.  Given the partial schedule PS, this
   function restores the ordering of the instructions according to their
   CUIDs in the DDG (PS->G), which is the original order of the instructions
   before performing SMS.  */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  for (i = 0; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
        || last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
                          PREV_INSN (last));
}

/* Used to generate the prologue & epilogue.  Duplicate the subset of
   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
   of both), together with a prefix/suffix of their reg_moves.  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
                           int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
        ddg_node_ptr u_node = ps_ij->node;
        int j, i_reg_moves;
        rtx reg_move = NULL_RTX;

        if (for_prolog)
          {
            /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
               number of reg_moves starting with the second occurrence of
               u_node, which is generated if its SCHED_STAGE <= to_stage.  */
            i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *first* reg_move backwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < i_reg_moves; j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }
        else /* It's for the epilog.  */
          {
            /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
               starting to decrease one stage after u_node no longer occurs;
               that is, generate all reg_moves until
               SCHED_STAGE (u_node) == from_stage - 1.  */
            i_reg_moves = SCHED_NREG_MOVES (u_node)
                          - (from_stage - SCHED_STAGE (u_node) - 1);
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *last* reg_move forwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }

        for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
          emit_insn (copy_rtx (PATTERN (reg_move)));
        if (SCHED_STAGE (u_node) >= from_stage
            && SCHED_STAGE (u_node) <= to_stage)
          duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}

/* Generate the instructions (including reg_moves) for prolog & epilog.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
                        rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
    /* Generate a subtract instruction at the beginning of the prolog to
       adjust the loop count by STAGE_COUNT.  */
    emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog on the one and only entry edge.  */
  e = loop_preheader_edge (loop);
  loop_split_edge_with (e, get_insns ());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilog on the one and only exit edge.  */
  gcc_assert (loop->single_exit);
  e = loop->single_exit;
  loop_split_edge_with (e, get_insns ());
  end_sequence ();
}

/* Return the line note insn preceding INSN, for debugging.  Taken from
   emit-rtl.c.  */
static rtx
find_line_note (rtx insn)
{
  for (; insn; insn = PREV_INSN (insn))
    if (NOTE_P (insn)
        && NOTE_LINE_NUMBER (insn) >= 0)
      break;

  return insn;
}

/* Return true if all the BBs of the loop are empty except the
   loop header.  */
static bool
loop_single_full_bb_p (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      rtx head, tail;
      bool empty_bb = true;

      if (bbs[i] == loop->header)
        continue;

      /* Make sure that basic blocks other than the header
         have only notes, labels or jumps.  */
      get_block_head_tail (bbs[i]->index, &head, &tail);
      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
        {
          if (NOTE_P (head) || LABEL_P (head)
              || (INSN_P (head) && JUMP_P (head)))
            continue;
          empty_bb = false;
          break;
        }

      if (! empty_bb)
        {
          free (bbs);
          return false;
        }
    }
  free (bbs);
  return true;
}

/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3)                     \
                                 && (EDGE_COUNT (loop->latch->preds) == 1) \
                                 && (EDGE_COUNT (loop->latch->succs) == 1))

/* Return true if the loop is in its canonical form, i.e. it is
   SIMPLE_SMS_LOOP_P, has one preheader block and a single exit;
   false if not.  */
static bool
loop_canon_p (struct loop *loop, FILE *dump_file)
{
  if (loop->inner || ! loop->outer)
    return false;

  if (!loop->single_exit)
    {
      if (dump_file)
        {
          rtx line_note = find_line_note (BB_END (loop->header));

          fprintf (dump_file, "SMS loop many exits ");
          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (stats_file, " %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
        }
      return false;
    }

  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
    {
      if (dump_file)
        {
          rtx line_note = find_line_note (BB_END (loop->header));

          fprintf (dump_file, "SMS loop many BBs. ");
          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (stats_file, " %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
        }
      return false;
    }

  return true;
}

/* If there is more than one entry to the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      loop_split_edge_with (e, NULL_RTX);

  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      FOR_EACH_EDGE (e, i, loop->header->preds)
        if (e->src == loop->latch)
          break;
      loop_split_edge_with (e, NULL_RTX);
    }
}

/* Build the loop information without loop
   canonicalization; the loop canonicalization will
   be performed if the loop is SMSable.  */
static struct loops *
build_loops_structure (FILE *dumpfile)
{
  struct loops *loops = xcalloc (1, sizeof (struct loops));

  /* Find the loops.  */

  if (flow_loops_find (loops) <= 1)
    {
      /* No loops.  */
      flow_loops_free (loops);
      free (loops);

      return NULL;
    }

  /* Not going to update these.  */
  free (loops->cfg.rc_order);
  loops->cfg.rc_order = NULL;
  free (loops->cfg.dfs_order);
  loops->cfg.dfs_order = NULL;

  create_preheaders (loops, CP_SIMPLE_PREHEADERS);
  mark_single_exit_loops (loops);
  /* Dump loops.  */
  flow_loops_dump (loops, dumpfile, NULL, 1);

#ifdef ENABLE_CHECKING
  verify_dominators (CDI_DOMINATORS);
  verify_loop_structure (loops);
#endif

  return loops;
}

/* Main entry point, perform SMS scheduling on the loops of the function
   that consist of single basic blocks.  */
void
sms_schedule (FILE *dump_file)
{
  static int passes = 0;
  rtx insn;
  ddg_ptr *g_arr, g;
  int *node_order;
  int maxii;
  unsigned i, num_loops;
  partial_schedule_ptr ps;
  struct df *df;
  struct loops *loops;
  basic_block bb = NULL;
  /* Variables for loop versioning, used only if needed.  */
  struct loop *nloop;
  basic_block condition_bb = NULL;
  edge latch_edge;
  gcov_type trip_count = 0;

  if (! (loops = build_loops_structure (dump_file)))
    return;  /* There are no loops to schedule.  */

  stats_file = dump_file;

  /* Initialize issue_rate.  */
  if (targetm.sched.issue_rate)
    {
      int temp = reload_completed;

      reload_completed = 1;
      issue_rate = targetm.sched.issue_rate ();
      reload_completed = temp;
    }
  else
    issue_rate = 1;

  /* Initialize the scheduler.  */
  current_sched_info = &sms_sched_info;
  sched_init (NULL);

  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
  df = df_init ();
  df_analyze (df, 0, DF_ALL);

  /* Allocate memory to hold the DDG array one entry for each loop.
     We use loop->num as index into this array.  */
  g_arr = xcalloc (loops->num, sizeof (ddg_ptr));

  /* Build DDGs for all the relevant loops and hold them in G_ARR
     indexed by the loop index.  */
  for (i = 0; i < loops->num; i++)
    {
      rtx head, tail;
      rtx count_reg;
      struct loop *loop = loops->parray[i];

      /* For debugging.  */
      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
        {
          if (dump_file)
            fprintf (dump_file, "SMS reached MAX_PASSES... \n");

          break;
        }

      if (! loop_canon_p (loop, dump_file))
        continue;

      if (! loop_single_full_bb_p (loop))
        continue;

      bb = loop->header;

      get_block_head_tail (bb->index, &head, &tail);
      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      if (loop->single_exit->count)
        trip_count = latch_edge->count / loop->single_exit->count;

      /* Perform SMS only on loops whose average count is above the
         threshold.  */

      if (latch_edge->count
          && (latch_edge->count
              < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
        {
          if (stats_file)
            {
              rtx line_note = find_line_note (tail);

              if (line_note)
                {
                  expanded_location xloc;
                  NOTE_EXPANDED_LOCATION (xloc, line_note);
                  fprintf (stats_file, "SMS bb %s %d (file, line)\n",
                           xloc.file, xloc.line);
                }
              fprintf (stats_file, "SMS single-bb-loop\n");
              if (profile_info && flag_branch_probabilities)
                {
                  fprintf (stats_file, "SMS loop-count ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) bb->count);
                  fprintf (stats_file, "\n");
                  fprintf (stats_file, "SMS trip-count ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) trip_count);
                  fprintf (stats_file, "\n");
                  fprintf (stats_file, "SMS profile-sum-max ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) profile_info->sum_max);
                  fprintf (stats_file, "\n");
                }
            }
          continue;
        }

      /* Make sure this is a doloop.  */
      if (! (count_reg = doloop_register_get (tail)))
        continue;

      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
        if (CALL_P (insn)
            || BARRIER_P (insn)
            || (INSN_P (insn) && !JUMP_P (insn)
                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
          break;

      if (insn != NEXT_INSN (tail))
        {
          if (stats_file)
            {
              if (CALL_P (insn))
                fprintf (stats_file, "SMS loop-with-call\n");
              else if (BARRIER_P (insn))
                fprintf (stats_file, "SMS loop-with-barrier\n");
              else
                fprintf (stats_file, "SMS loop-with-not-single-set\n");
              print_rtl_single (stats_file, insn);
            }

          continue;
        }

      if (! (g = create_ddg (bb, df, 0)))
        {
          if (stats_file)
            fprintf (stats_file, "SMS doloop\n");
          continue;
        }

      g_arr[i] = g;
    }

  /* Release Data Flow analysis data structures.  */
  df_finish (df);

  /* We don't want to perform SMS on new loops - created by versioning.  */
  num_loops = loops->num;
  /* Go over the built DDGs and perform SMS for each one of them.  */
  for (i = 0; i < num_loops; i++)
    {
      rtx head, tail;
      rtx count_reg, count_init;
      int mii, rec_mii;
      unsigned stage_count = 0;
      HOST_WIDEST_INT loop_count = 0;
      struct loop *loop = loops->parray[i];

      if (! (g = g_arr[i]))
        continue;

      if (dump_file)
        print_ddg (dump_file, g);

      get_block_head_tail (loop->header->index, &head, &tail);

      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      if (loop->single_exit->count)
        trip_count = latch_edge->count / loop->single_exit->count;

      if (stats_file)
        {
          rtx line_note = find_line_note (tail);

          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (stats_file, "SMS bb %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
          fprintf (stats_file, "SMS single-bb-loop\n");
          if (profile_info && flag_branch_probabilities)
            {
              fprintf (stats_file, "SMS loop-count ");
              fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) bb->count);
              fprintf (stats_file, "\n");
              fprintf (stats_file, "SMS profile-sum-max ");
              fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) profile_info->sum_max);
              fprintf (stats_file, "\n");
            }
          fprintf (stats_file, "SMS doloop\n");
          fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
          fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
          fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
        }

      /* If the loop has a doloop register, it gets special
         handling.  */
      count_init = NULL_RTX;
      if ((count_reg = doloop_register_get (tail)))
        {
          basic_block pre_header;

          pre_header = loop_preheader_edge (loop)->src;
          count_init = const_iteration_count (count_reg, pre_header,
                                              &loop_count);
        }
      gcc_assert (count_reg);

      if (stats_file && count_init)
        {
          fprintf (stats_file, "SMS const-doloop ");
          fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                   loop_count);
          fprintf (stats_file, "\n");
        }

      node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);

      mii = 1; /* Need to pass some estimate of mii.  */
      rec_mii = sms_order_nodes (g, mii, node_order);
      mii = MAX (res_MII (g), rec_mii);
      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;

      if (stats_file)
        fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
                 rec_mii, mii, maxii);

      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
         ASAP.  */
      set_node_sched_params (g);

      ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);

      if (ps)
        stage_count = PS_STAGE_COUNT (ps);

      /* Stage count of 1 means that there is no interleaving between
         iterations, let the scheduling passes do the job.  */
      if (stage_count < 1
          || (count_init && (loop_count <= stage_count))
          || (flag_branch_probabilities && (trip_count <= stage_count)))
        {
          if (dump_file)
            {
              fprintf (dump_file, "SMS failed... \n");
              fprintf (dump_file,
                       "SMS sched-failed (stage-count=%d, loop-count=",
                       stage_count);
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
              fprintf (dump_file, ", trip-count=");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
              fprintf (dump_file, ")\n");
            }
          continue;
        }
      else
        {
          int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb),
                                                     BB_END (g->bb));
          int new_cycles;
          struct undo_replace_buff_elem *reg_move_replaces;

          if (stats_file)
            {
              fprintf (stats_file,
                       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
                       stage_count);
              print_partial_schedule (ps, stats_file);
              fprintf (stats_file,
                       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
                       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
            }

          /* Set the stage boundaries.  If the DDG is built with
             closing_branch_deps, the closing_branch was scheduled and should
             appear in the last (ii-1) row.  Otherwise, we are free to
             schedule the branch, and we let nodes that were scheduled at the
             first PS_MIN_CYCLE cycle appear in the first row; this should
             reduce stage_count to minimum.  */
          normalize_sched_times (ps);
          rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
          set_columns_for_ps (ps);

          /* Generate the kernel just to be able to measure its cycles.  */
          permute_partial_schedule (ps, g->closing_branch->first_note);
          reg_move_replaces = generate_reg_moves (ps);

          /* Get the number of cycles the new kernel expects to execute in.  */
          new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb),
                                                BB_END (g->bb));

          /* Get back to the original loop so we can do loop versioning.  */
          undo_permute_partial_schedule (ps, g->closing_branch->first_note);
          if (reg_move_replaces)
            undo_generate_reg_moves (ps, reg_move_replaces);

          if (new_cycles >= orig_cycles)
            {
              /* SMS is not profitable, so undo the permutation and reg move
                 generation and return the kernel to its original state.  */
              if (dump_file)
                fprintf (dump_file,
                         "Undoing SMS because it is not profitable.\n");
            }
          else
            {
              canon_loop (loop);

              /* In case the BCT count is not known, do loop versioning.  */
              if (count_reg && ! count_init)
                {
                  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
                                                 GEN_INT (stage_count));

                  nloop = loop_version (loops, loop, comp_rtx,
                                        &condition_bb);
                }

              /* Set new iteration count of loop kernel.  */
              if (count_reg && count_init)
                SET_SRC (single_set (count_init)) = GEN_INT (loop_count
                                                             - stage_count + 1);

              /* Now apply the scheduled kernel to the RTL of the loop.  */
              permute_partial_schedule (ps, g->closing_branch->first_note);

              /* Mark this loop as software pipelined so the later
                 scheduling passes don't touch it.  */
              if (! flag_resched_modulo_sched)
                g->bb->flags |= BB_DISABLE_SCHEDULE;
              /* The life-info is not valid any more.  */
              g->bb->flags |= BB_DIRTY;

              reg_move_replaces = generate_reg_moves (ps);
              if (dump_file)
                print_node_sched_params (dump_file, g->num_nodes);
              /* Generate prolog and epilog.  */
              if (count_reg && !count_init)
                generate_prolog_epilog (ps, loop, count_reg);
              else
                generate_prolog_epilog (ps, loop, NULL_RTX);
            }
          free_undo_replace_buff (reg_move_replaces);
        }

      free_partial_schedule (ps);
      free (node_sched_params);
      free (node_order);
      free_ddg (g);
    }

  /* Release scheduler data, needed until now because of DFA.  */
  sched_finish ();
  loop_optimizer_finalize (loops, dump_file);
}

/* The SMS scheduling algorithm itself
   -----------------------------------
   Input: 'O' an ordered list of insns of a loop.
   Output: A scheduling of the loop - kernel, prolog, and epilogue.

   'Q' is the empty Set
   'PS' is the partial schedule; it holds the currently scheduled nodes with
        their cycle/slot.
   'PSP' previously scheduled predecessors.
   'PSS' previously scheduled successors.
   't(u)' the cycle where u is scheduled.
   'l(u)' is the latency of u.
   'd(v,u)' is the dependence distance from v to u.
   'ASAP(u)' the earliest time at which u could be scheduled as computed in
             the node ordering phase.
   'check_hardware_resources_conflicts(u, PS, c)'
             run a trace around cycle/slot through DFA model
             to check resource conflicts involving instruction u
             at cycle c given the partial schedule PS.
   'add_to_partial_schedule_at_time(u, PS, c)'
             Add the node/instruction u to the partial schedule
             PS at time c.
   'calculate_register_pressure(PS)'
             Given a schedule of instructions, calculate the register
             pressure it implies.  One implementation could be the
             maximum number of overlapping live ranges.
   'maxRP' The maximum allowed register pressure, usually derived from the
           number of registers available in the hardware.

   1. II = MII.
   2. PS = empty list
   3. for each node u in O in pre-computed order
   4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   6.     start = Early_start; end = Early_start + II - 1; step = 1
   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   13.    start = Late_start; end = Late_start - II + 1; step = -1
   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.    Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   16.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   17.    start = Early_start;
   18.    end = min(Early_start + II - 1 , Late_start);
   19.    step = 1
   20.  else "if (PSP(u) == Q && PSS(u) == Q)"
   21.    start = ASAP(u); end = start + II - 1; step = 1
   22.  endif

   23.  success = false
   24.  for (c = start ; c != end ; c += step)
   25.     if check_hardware_resources_conflicts(u, PS, c) then
   26.       add_to_partial_schedule_at_time(u, PS, c)
   27.       success = true
   28.       break
   29.     endif
   30.  endfor
   31.  if (success == false) then
   32.    II = II + 1
   33.    if (II > maxII) then
   34.       finish - failed to schedule
   35.    endif
   36.    goto 2.
   37.  endif
   38. endfor
   39. if (calculate_register_pressure(PS) > maxRP) then
   40.    goto 32.
   41. endif
   42. compute epilogue & prologue
   43. finish - succeeded to schedule
*/
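
/* Illustrative worked instance of steps 5-6 above.  Assume II = 4 and u
   has two already-scheduled predecessors v1, v2 with
     t(v1) = 0, l(v1) = 2, d(v1,u) = 0 and
     t(v2) = 3, l(v2) = 1, d(v2,u) = 1.
   Then Early_start(u) = max (0 + 2 - 0*4, 3 + 1 - 1*4) = max (2, 0) = 2,
   so the cycles tried for u are 2 .. 2 + 4 - 1 = 5 with step 1.  */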

/* A limit on the number of cycles that resource conflicts can span.
   ??? Should be provided by DFA, and be dependent on the type of insn
   scheduled.  Currently set to 0 to save compile time.  */
#define DFA_HISTORY SMS_DFA_HISTORY

/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get a non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
                  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
                  int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order[i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int node_st = SCHED_TIME (v_node)
                            + e->latency - (e->distance * ii);

              early_start = MAX (early_start, node_st);

              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MAX (end, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              early_start = MAX (early_start,
                                 SCHED_TIME (v_node) + e->latency
                                 - (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}

/* This function implements the scheduling algorithm for SMS according to the
   above algorithm.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order,
                       FILE *dump_file)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
          || try_again_with_larger_ii) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
        fprintf (dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
        {
          try_again_with_larger_ii = false;
          sbitmap_zero (sched_nodes);
        }

      for (i = 0; i < num_nodes; i++)
        {
          int u = nodes_order[i];
          ddg_node_ptr u_node = &ps->g->nodes[u];
          rtx insn = u_node->insn;

          if (!INSN_P (insn))
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (JUMP_P (insn)) /* Closing branch handled later.  */
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (TEST_BIT (sched_nodes, u))
            continue;

          /* Try to get a non-empty scheduling window.  */
          j = i;
          while (get_sched_window (ps, nodes_order, i, sched_nodes, ii,
                                   &start, &step, &end) < 0
                 && j > 0)
            {
              unscheduled_nodes = true;
              if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
                  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
                {
                  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
                  RESET_BIT (sched_nodes, nodes_order[j - 1]);
                }
              j--;
            }
          if (j < 0)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          /* 2. Try scheduling u in window.  */
          if (dump_file)
            fprintf (dump_file,
                     "Trying to schedule node %d in (%d .. %d) step %d\n",
                     u, start, end, step);

          /* Use must_follow & must_precede bitmaps to determine order
             of nodes within the cycle.  */
          sbitmap_zero (must_precede);
          sbitmap_zero (must_follow);
          for (e = u_node->in; e != 0; e = e->next_in)
            if (TEST_BIT (sched_nodes, e->src->cuid)
                && e->latency == (ii * e->distance)
                && start == SCHED_TIME (e->src))
              SET_BIT (must_precede, e->src->cuid);

          for (e = u_node->out; e != 0; e = e->next_out)
            if (TEST_BIT (sched_nodes, e->dest->cuid)
                && e->latency == (ii * e->distance)
                && end == SCHED_TIME (e->dest))
              SET_BIT (must_follow, e->dest->cuid);

          success = 0;
          if ((step > 0 && start < end) || (step < 0 && start > end))
            for (c = start; c != end; c += step)
              {
                ps_insn_ptr psi;

                psi = ps_add_node_check_conflicts (ps, u_node, c,
                                                   must_precede,
                                                   must_follow);

                if (psi)
                  {
                    SCHED_TIME (u_node) = c;
                    SET_BIT (sched_nodes, u);
                    success = 1;
                    if (dump_file)
                      fprintf (dump_file, "Schedule in %d\n", c);
                    break;
                  }
              }
          if (!success)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          if (unscheduled_nodes)
            break;

          /* ??? If (success), check register pressure estimates.  */
        } /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}

\f
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

enum sms_direction {BOTTOMUP, TOPDOWN};

struct node_order_params
{
  int asap;
  int alap;
  int height;
};

/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
static void
check_nodes_order (int *node_order, int num_nodes)
{
  int i;
  sbitmap tmp = sbitmap_alloc (num_nodes);

  sbitmap_zero (tmp);

  for (i = 0; i < num_nodes; i++)
    {
      int u = node_order[i];

      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));

      SET_BIT (tmp, u);
    }

  sbitmap_free (tmp);
}

/* Order the nodes of G for scheduling and pass the result in
   NODE_ORDER.  Also set aux.count of each node to ASAP.
   Return the recMII for the given DDG.  */
static int
sms_order_nodes (ddg_ptr g, int mii, int * node_order)
{
  int i;
  int rec_mii = 0;
  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);

  nopa nops = calculate_order_params (g, mii);

  order_nodes_of_sccs (sccs, node_order);

  if (sccs->num_sccs > 0)
    /* First SCC has the largest recurrence_length.  */
    rec_mii = sccs->sccs[0]->recurrence_length;

  /* Save ASAP before destroying node_order_params.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr v = &g->nodes[i];
      v->aux.count = ASAP (v);
    }

  free (nops);
  free_ddg_all_sccs (sccs);
  check_nodes_order (node_order, g->num_nodes);

  return rec_mii;
}

static void
order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
{
  int i, pos = 0;
  ddg_ptr g = all_sccs->ddg;
  int num_nodes = g->num_nodes;
  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
  sbitmap on_path = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap ones = sbitmap_alloc (num_nodes);

  sbitmap_zero (prev_sccs);
  sbitmap_ones (ones);

  /* Perform the node ordering starting from the SCC with the highest recMII.
     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
  for (i = 0; i < all_sccs->num_sccs; i++)
    {
      ddg_scc_ptr scc = all_sccs->sccs[i];

      /* Add nodes on paths from previous SCCs to the current SCC.  */
      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
      sbitmap_a_or_b (tmp, scc->nodes, on_path);

      /* Add nodes on paths from the current SCC to previous SCCs.  */
      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
      sbitmap_a_or_b (tmp, tmp, on_path);

      /* Remove nodes of previous SCCs from current extended SCC.  */
      sbitmap_difference (tmp, tmp, prev_sccs);

      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
    }

  /* Handle the remaining nodes that do not belong to any scc.  Each call
     to order_nodes_in_scc handles a single connected component.  */
  while (pos < g->num_nodes)
    {
      sbitmap_difference (tmp, ones, prev_sccs);
      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
    }
  sbitmap_free (prev_sccs);
  sbitmap_free (on_path);
  sbitmap_free (tmp);
  sbitmap_free (ones);
}
1785
1786 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1787 static struct node_order_params *
1788 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1789 {
1790 int u;
1791 int max_asap;
1792 int num_nodes = g->num_nodes;
1793 ddg_edge_ptr e;
1794 /* Allocate a place to hold ordering params for each node in the DDG. */
1795 nopa node_order_params_arr;
1796
1797 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1798 node_order_params_arr = (nopa) xcalloc (num_nodes,
1799 sizeof (struct node_order_params));
1800
1801 /* Set the aux pointer of each node to point to its order_params structure. */
1802 for (u = 0; u < num_nodes; u++)
1803 g->nodes[u].aux.info = &node_order_params_arr[u];
1804
1805 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1806 calculate ASAP, ALAP, mobility, distance, and height for each node
1807 in the dependence (directed acyclic) graph.  */
1808
1809 /* We assume that the nodes in the array are in topological order. */
1810
1811 max_asap = 0;
1812 for (u = 0; u < num_nodes; u++)
1813 {
1814 ddg_node_ptr u_node = &g->nodes[u];
1815
1816 ASAP (u_node) = 0;
1817 for (e = u_node->in; e; e = e->next_in)
1818 if (e->distance == 0)
1819 ASAP (u_node) = MAX (ASAP (u_node),
1820 ASAP (e->src) + e->latency);
1821 max_asap = MAX (max_asap, ASAP (u_node));
1822 }
1823
1824 for (u = num_nodes - 1; u > -1; u--)
1825 {
1826 ddg_node_ptr u_node = &g->nodes[u];
1827
1828 ALAP (u_node) = max_asap;
1829 HEIGHT (u_node) = 0;
1830 for (e = u_node->out; e; e = e->next_out)
1831 if (e->distance == 0)
1832 {
1833 ALAP (u_node) = MIN (ALAP (u_node),
1834 ALAP (e->dest) - e->latency);
1835 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1836 HEIGHT (e->dest) + e->latency);
1837 }
1838 }
1839
1840 return node_order_params_arr;
1841 }
1842
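/* Of the nodes set in NODES, return the index of the one with maximal
   ASAP, or -1 if NODES is empty.  */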
1843 static int
1844 find_max_asap (ddg_ptr g, sbitmap nodes)
1845 {
1846 unsigned int u;
1847 int max_asap = -1;
1848 int result = -1;
1849 sbitmap_iterator sbi;
1850
1851 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1852 {
1853 ddg_node_ptr u_node = &g->nodes[u];
1854
1855 if (max_asap < ASAP (u_node))
1856 {
1857 max_asap = ASAP (u_node);
1858 result = u;
1859 }
1860 }
1861 return result;
1862 }
1863
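/* Of the nodes set in NODES, return the index of the one with maximal
   HEIGHT, breaking ties in favor of minimal mobility (MOB); return -1
   if NODES is empty.  */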
1864 static int
1865 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1866 {
1867 unsigned int u;
1868 int max_hv = -1;
1869 int min_mob = INT_MAX;
1870 int result = -1;
1871 sbitmap_iterator sbi;
1872
1873 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1874 {
1875 ddg_node_ptr u_node = &g->nodes[u];
1876
1877 if (max_hv < HEIGHT (u_node))
1878 {
1879 max_hv = HEIGHT (u_node);
1880 min_mob = MOB (u_node);
1881 result = u;
1882 }
1883 else if ((max_hv == HEIGHT (u_node))
1884 && (min_mob > MOB (u_node)))
1885 {
1886 min_mob = MOB (u_node);
1887 result = u;
1888 }
1889 }
1890 return result;
1891 }
1892
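/* Of the nodes set in NODES, return the index of the one with maximal
   DEPTH, breaking ties in favor of minimal mobility (MOB); return -1
   if NODES is empty.  */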
1893 static int
1894 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1895 {
1896 unsigned int u;
1897 int max_dv = -1;
1898 int min_mob = INT_MAX;
1899 int result = -1;
1900 sbitmap_iterator sbi;
1901
1902 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1903 {
1904 ddg_node_ptr u_node = &g->nodes[u];
1905
1906 if (max_dv < DEPTH (u_node))
1907 {
1908 max_dv = DEPTH (u_node);
1909 min_mob = MOB (u_node);
1910 result = u;
1911 }
1912 else if ((max_dv == DEPTH (u_node))
1913 && (min_mob > MOB (u_node)))
1914 {
1915 min_mob = MOB (u_node);
1916 result = u;
1917 }
1918 }
1919 return result;
1920 }
1921
1922 /* Places the nodes of SCC into the NODE_ORDER array starting
1923 at position POS, according to the SMS ordering algorithm.
1924 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1925 the NODE_ORDER array, starting from position zero. */
1926 static int
1927 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1928 int * node_order, int pos)
1929 {
1930 enum sms_direction dir;
1931 int num_nodes = g->num_nodes;
1932 sbitmap workset = sbitmap_alloc (num_nodes);
1933 sbitmap tmp = sbitmap_alloc (num_nodes);
1934 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1935 sbitmap predecessors = sbitmap_alloc (num_nodes);
1936 sbitmap successors = sbitmap_alloc (num_nodes);
1937
1938 sbitmap_zero (predecessors);
1939 find_predecessors (predecessors, g, nodes_ordered);
1940
1941 sbitmap_zero (successors);
1942 find_successors (successors, g, nodes_ordered);
1943
1944 sbitmap_zero (tmp);
1945 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1946 {
1947 sbitmap_copy (workset, tmp);
1948 dir = BOTTOMUP;
1949 }
1950 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1951 {
1952 sbitmap_copy (workset, tmp);
1953 dir = TOPDOWN;
1954 }
1955 else
1956 {
1957 int u;
1958
1959 sbitmap_zero (workset);
1960 if ((u = find_max_asap (g, scc)) >= 0)
1961 SET_BIT (workset, u);
1962 dir = BOTTOMUP;
1963 }
1964
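/* Alternate between top-down and bottom-up sweeps: each sweep drains
   WORKSET (following successors when going top-down, predecessors when
   going bottom-up) and then refills it from the opposite direction,
   restricted to the current SCC.  */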
1965 sbitmap_zero (zero_bitmap);
1966 while (!sbitmap_equal (workset, zero_bitmap))
1967 {
1968 int v;
1969 ddg_node_ptr v_node;
1970 sbitmap v_node_preds;
1971 sbitmap v_node_succs;
1972
1973 if (dir == TOPDOWN)
1974 {
1975 while (!sbitmap_equal (workset, zero_bitmap))
1976 {
1977 v = find_max_hv_min_mob (g, workset);
1978 v_node = &g->nodes[v];
1979 node_order[pos++] = v;
1980 v_node_succs = NODE_SUCCESSORS (v_node);
1981 sbitmap_a_and_b (tmp, v_node_succs, scc);
1982
1983 /* Don't consider the already ordered successors again. */
1984 sbitmap_difference (tmp, tmp, nodes_ordered);
1985 sbitmap_a_or_b (workset, workset, tmp);
1986 RESET_BIT (workset, v);
1987 SET_BIT (nodes_ordered, v);
1988 }
1989 dir = BOTTOMUP;
1990 sbitmap_zero (predecessors);
1991 find_predecessors (predecessors, g, nodes_ordered);
1992 sbitmap_a_and_b (workset, predecessors, scc);
1993 }
1994 else
1995 {
1996 while (!sbitmap_equal (workset, zero_bitmap))
1997 {
1998 v = find_max_dv_min_mob (g, workset);
1999 v_node = &g->nodes[v];
2000 node_order[pos++] = v;
2001 v_node_preds = NODE_PREDECESSORS (v_node);
2002 sbitmap_a_and_b (tmp, v_node_preds, scc);
2003
2004 /* Don't consider the already ordered predecessors again. */
2005 sbitmap_difference (tmp, tmp, nodes_ordered);
2006 sbitmap_a_or_b (workset, workset, tmp);
2007 RESET_BIT (workset, v);
2008 SET_BIT (nodes_ordered, v);
2009 }
2010 dir = TOPDOWN;
2011 sbitmap_zero (successors);
2012 find_successors (successors, g, nodes_ordered);
2013 sbitmap_a_and_b (workset, successors, scc);
2014 }
2015 }
2016 sbitmap_free (tmp);
2017 sbitmap_free (workset);
2018 sbitmap_free (zero_bitmap);
2019 sbitmap_free (predecessors);
2020 sbitmap_free (successors);
2021 return pos;
2022 }
2023
2024 \f
2025 /* This page contains functions for manipulating partial-schedules during
2026 modulo scheduling. */
2027
2028 /* Create a partial schedule and allocate memory to hold II rows.  */
2029 partial_schedule_ptr
2030 create_partial_schedule (int ii, ddg_ptr g, int history)
2031 {
2032 partial_schedule_ptr ps = (partial_schedule_ptr)
2033 xmalloc (sizeof (struct partial_schedule));
2034 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2035 ps->ii = ii;
2036 ps->history = history;
2037 ps->min_cycle = INT_MAX;
2038 ps->max_cycle = INT_MIN;
2039 ps->g = g;
2040
2041 return ps;
2042 }
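
/* A minimal lifecycle sketch, for illustration only (the driver computes
   the actual II and DFA history window elsewhere):

     partial_schedule_ptr ps = create_partial_schedule (ii, g, history);
     ... schedule; on failure reset_partial_schedule (ps, ii + 1) and retry ...
     free_partial_schedule (ps);  */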
2043
2044 /* Free the PS_INSNs in the rows array of the given partial schedule.
2045 ??? Consider caching the PS_INSNs.  */
2046 static void
2047 free_ps_insns (partial_schedule_ptr ps)
2048 {
2049 int i;
2050
2051 for (i = 0; i < ps->ii; i++)
2052 {
2053 while (ps->rows[i])
2054 {
2055 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2056
2057 free (ps->rows[i]);
2058 ps->rows[i] = ps_insn;
2059 }
2060 ps->rows[i] = NULL;
2061 }
2062 }
2063
2064 /* Free all the memory allocated to the partial schedule. */
2065 void
2066 free_partial_schedule (partial_schedule_ptr ps)
2067 {
2068 if (!ps)
2069 return;
2070 free_ps_insns (ps);
2071 free (ps->rows);
2072 free (ps);
2073 }
2074
2075 /* Clear the rows array of its PS_INSNs, and resize it to hold NEW_II
2076 rows (nothing to resize if NEW_II equals the current II).  */
2077 void
2078 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2079 {
2080 if (!ps)
2081 return;
2082 free_ps_insns (ps);
2083 if (new_ii == ps->ii)
2084 return;
2085 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2086 * sizeof (ps_insn_ptr));
2087 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2088 ps->ii = new_ii;
2089 ps->min_cycle = INT_MAX;
2090 ps->max_cycle = INT_MIN;
2091 }
2092
2093 /* Prints the partial schedule as an array of II rows; for each row,
2094 print the UIDs of the insns in it.  */
2095 void
2096 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2097 {
2098 int i;
2099
2100 for (i = 0; i < ps->ii; i++)
2101 {
2102 ps_insn_ptr ps_i = ps->rows[i];
2103
2104 fprintf (dump, "\n[CYCLE %d ]: ", i);
2105 while (ps_i)
2106 {
2107 fprintf (dump, "%d, ",
2108 INSN_UID (ps_i->node->insn));
2109 ps_i = ps_i->next_in_row;
2110 }
2111 }
2112 }
2113
2114 /* Creates a PS_INSN object and initializes it with the given parameters.  */
2115 static ps_insn_ptr
2116 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2117 {
2118 ps_insn_ptr ps_i = (ps_insn_ptr) xmalloc (sizeof (struct ps_insn));
2119
2120 ps_i->node = node;
2121 ps_i->next_in_row = NULL;
2122 ps_i->prev_in_row = NULL;
2123 ps_i->row_rest_count = rest_count;
2124 ps_i->cycle = cycle;
2125
2126 return ps_i;
2127 }
2128
2129
2130 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2131 node is not found in the partial schedule, else returns true. */
2132 static bool
2133 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2134 {
2135 int row;
2136
2137 if (!ps || !ps_i)
2138 return false;
2139
2140 row = SMODULO (ps_i->cycle, ps->ii);
2141 if (! ps_i->prev_in_row)
2142 {
2143 if (ps_i != ps->rows[row])
2144 return false;
2145
2146 ps->rows[row] = ps_i->next_in_row;
2147 if (ps->rows[row])
2148 ps->rows[row]->prev_in_row = NULL;
2149 }
2150 else
2151 {
2152 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2153 if (ps_i->next_in_row)
2154 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2155 }
2156 free (ps_i);
2157 return true;
2158 }
2159
2160 /* Unlike what the literature describes for modulo scheduling (which focuses
2161 on VLIW machines), the order of the instructions inside a cycle is
2162 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2163 where the current instruction should go relative to the already
2164 scheduled instructions in the given cycle. Go over these
2165 instructions and find the first possible column to put it in. */
2166 static bool
2167 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2168 sbitmap must_precede, sbitmap must_follow)
2169 {
2170 ps_insn_ptr next_ps_i;
2171 ps_insn_ptr first_must_follow = NULL;
2172 ps_insn_ptr last_must_precede = NULL;
2173 int row;
2174
2175 if (! ps_i)
2176 return false;
2177
2178 row = SMODULO (ps_i->cycle, ps->ii);
2179
2180 /* Find the first must_follow and the last must_precede insns, and
2181 insert the node immediately after the last must_precede insn,
2182 making sure that no must_follow insn precedes that point.  */
2183 for (next_ps_i = ps->rows[row];
2184 next_ps_i;
2185 next_ps_i = next_ps_i->next_in_row)
2186 {
2187 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2188 && ! first_must_follow)
2189 first_must_follow = next_ps_i;
2190 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2191 {
2192 /* If we have already met a node that must follow, then
2193 there is no possible column. */
2194 if (first_must_follow)
2195 return false;
2196 else
2197 last_must_precede = next_ps_i;
2198 }
2199 }
2200
2201 /* Now insert the node after LAST_MUST_PRECEDE, or at the start of the row if there is none.  */
2202
2203 if (! last_must_precede)
2204 {
2205 ps_i->next_in_row = ps->rows[row];
2206 ps_i->prev_in_row = NULL;
2207 if (ps_i->next_in_row)
2208 ps_i->next_in_row->prev_in_row = ps_i;
2209 ps->rows[row] = ps_i;
2210 }
2211 else
2212 {
2213 ps_i->next_in_row = last_must_precede->next_in_row;
2214 last_must_precede->next_in_row = ps_i;
2215 ps_i->prev_in_row = last_must_precede;
2216 if (ps_i->next_in_row)
2217 ps_i->next_in_row->prev_in_row = ps_i;
2218 }
2219
2220 return true;
2221 }
2222
2223 /* Advances the PS_INSN one column in its current row; returns false
2224 on failure and true on success.  Bit N is set in MUST_FOLLOW if
2225 the node with cuid N must come after the node pointed to by
2226 PS_I when scheduled in the same cycle.  */
2227 static int
2228 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2229 sbitmap must_follow)
2230 {
2231 ps_insn_ptr prev, next;
2232 int row;
2233 ddg_node_ptr next_node;
2234
2235 if (!ps || !ps_i)
2236 return false;
2237
2238 row = SMODULO (ps_i->cycle, ps->ii);
2239
2240 if (! ps_i->next_in_row)
2241 return false;
2242
2243 next_node = ps_i->next_in_row->node;
2244
2245 /* Check if next_in_row is dependent on ps_i, both having same sched
2246 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2247 if (TEST_BIT (must_follow, next_node->cuid))
2248 return false;
2249
2250 /* Advance PS_I over its next_in_row in the doubly linked list. */
2251 prev = ps_i->prev_in_row;
2252 next = ps_i->next_in_row;
2253
2254 if (ps_i == ps->rows[row])
2255 ps->rows[row] = next;
2256
2257 ps_i->next_in_row = next->next_in_row;
2258
2259 if (next->next_in_row)
2260 next->next_in_row->prev_in_row = ps_i;
2261
2262 next->next_in_row = ps_i;
2263 ps_i->prev_in_row = next;
2264
2265 next->prev_in_row = prev;
2266 if (prev)
2267 prev->next_in_row = next;
2268
2269 return true;
2270 }
2271
2272 /* Inserts a DDG_NODE into the given partial schedule at the given cycle.
2273 Returns NULL if this is not possible and a PS_INSN otherwise.  Bit N is
2274 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2275 before/after (respectively) the node pointed to by PS_I when scheduled
2276 in the same cycle.  */
2277 static ps_insn_ptr
2278 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2279 sbitmap must_precede, sbitmap must_follow)
2280 {
2281 ps_insn_ptr ps_i;
2282 int rest_count = 1;
2283 int row = SMODULO (cycle, ps->ii);
2284
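/* A row is considered full once the ROW_REST_COUNT cached at its head
   reaches issue_rate; the new insn's count is one more than the head's.  */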
2285 if (ps->rows[row]
2286 && ps->rows[row]->row_rest_count >= issue_rate)
2287 return NULL;
2288
2289 if (ps->rows[row])
2290 rest_count += ps->rows[row]->row_rest_count;
2291
2292 ps_i = create_ps_insn (node, rest_count, cycle);
2293
2294 /* Finds and inserts PS_I according to MUST_FOLLOW and
2295 MUST_PRECEDE. */
2296 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2297 {
2298 free (ps_i);
2299 return NULL;
2300 }
2301
2302 return ps_i;
2303 }
2304
2305 /* Advance time one cycle. Assumes DFA is being used. */
2306 static void
2307 advance_one_cycle (void)
2308 {
2309 if (targetm.sched.dfa_pre_cycle_insn)
2310 state_transition (curr_state,
2311 targetm.sched.dfa_pre_cycle_insn ());
2312
2313 state_transition (curr_state, NULL);
2314
2315 if (targetm.sched.dfa_post_cycle_insn)
2316 state_transition (curr_state,
2317 targetm.sched.dfa_post_cycle_insn ());
2318 }
2319
2320 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), find
2321 the number of cycles in which the kernel fits according to the DFA.
2322 We use this to check whether we did well with SMS after adding the
2323 register moves, since in some cases the register-move overhead makes
2324 the result even worse than the original loop.  We want SMS to be
2325 performed only when it yields fewer cycles once register moves are added.  */
2326 static int
2327 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2328 {
2329 int cycles = 0;
2330 rtx insn;
2331 int can_issue_more = issue_rate;
2332
2333 state_reset (curr_state);
2334
2335 for (insn = first_insn;
2336 insn != NULL_RTX && insn != last_insn;
2337 insn = NEXT_INSN (insn))
2338 {
2339 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2340 continue;
2341
2342 /* Check if there is room for the current insn. */
2343 if (!can_issue_more || state_dead_lock_p (curr_state))
2344 {
2345 cycles++;
2346 advance_one_cycle ();
2347 can_issue_more = issue_rate;
2348 }
2349
2350 /* Update the DFA state; if the DFA found resource conflicts,
2351 advance to a new cycle.  */
2352 if (state_transition (curr_state, insn) >= 0)
2353 {
2354 cycles++;
2355 advance_one_cycle ();
2356 can_issue_more = issue_rate;
2357 }
2358
2359 if (targetm.sched.variable_issue)
2360 can_issue_more =
2361 targetm.sched.variable_issue (sched_dump, sched_verbose,
2362 insn, can_issue_more);
2363 /* A naked CLOBBER or USE generates no instruction, so don't
2364 let them consume issue slots. */
2365 else if (GET_CODE (PATTERN (insn)) != USE
2366 && GET_CODE (PATTERN (insn)) != CLOBBER)
2367 can_issue_more--;
2368 }
2369 return cycles;
2370 }
2371
2372 /* Checks if PS has resource conflicts according to DFA, starting from
2373 FROM cycle to TO cycle; returns true if there are conflicts and false
2374 if there are no conflicts. Assumes DFA is being used. */
2375 static int
2376 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2377 {
2378 int cycle;
2379
2380 state_reset (curr_state);
2381
2382 for (cycle = from; cycle <= to; cycle++)
2383 {
2384 ps_insn_ptr crr_insn;
2385 /* Holds the remaining issue slots in the current row. */
2386 int can_issue_more = issue_rate;
2387
2388 /* Walk through the DFA for the current row. */
2389 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2390 crr_insn;
2391 crr_insn = crr_insn->next_in_row)
2392 {
2393 rtx insn = crr_insn->node->insn;
2394
2395 if (!INSN_P (insn))
2396 continue;
2397
2398 /* Check if there is room for the current insn. */
2399 if (!can_issue_more || state_dead_lock_p (curr_state))
2400 return true;
2401
2402 /* Update the DFA state and return with failure if the DFA found
2403 resource conflicts.  */
2404 if (state_transition (curr_state, insn) >= 0)
2405 return true;
2406
2407 if (targetm.sched.variable_issue)
2408 can_issue_more =
2409 targetm.sched.variable_issue (sched_dump, sched_verbose,
2410 insn, can_issue_more);
2411 /* A naked CLOBBER or USE generates no instruction, so don't
2412 let them consume issue slots. */
2413 else if (GET_CODE (PATTERN (insn)) != USE
2414 && GET_CODE (PATTERN (insn)) != CLOBBER)
2415 can_issue_more--;
2416 }
2417
2418 /* Advance the DFA to the next cycle. */
2419 advance_one_cycle ();
2420 }
2421 return false;
2422 }
2423
2424 /* Checks if the given node causes resource conflicts when added to PS at
2425 cycle C.  If not, the node is added to PS and returned; otherwise NULL
2426 is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2427 cuid N must come before/after (respectively) the node pointed to by
2428 PS_I when scheduled in the same cycle.  */
2429 ps_insn_ptr
2430 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2431 int c, sbitmap must_precede,
2432 sbitmap must_follow)
2433 {
2434 int has_conflicts = 0;
2435 ps_insn_ptr ps_i;
2436
2437 /* First add the node to the PS, if this succeeds check for
2438 conflicts, trying different issue slots in the same row. */
2439 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2440 return NULL; /* Failed to insert the node at the given cycle. */
2441
2442 has_conflicts = ps_has_conflicts (ps, c, c)
2443 || (ps->history > 0
2444 && ps_has_conflicts (ps,
2445 c - ps->history,
2446 c + ps->history));
2447
2448 /* Try different issue slots to find one that the given node can be
2449 scheduled in without conflicts. */
2450 while (has_conflicts)
2451 {
2452 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2453 break;
2454 has_conflicts = ps_has_conflicts (ps, c, c)
2455 || (ps->history > 0
2456 && ps_has_conflicts (ps,
2457 c - ps->history,
2458 c + ps->history));
2459 }
2460
2461 if (has_conflicts)
2462 {
2463 remove_node_from_ps (ps, ps_i);
2464 return NULL;
2465 }
2466
2467 ps->min_cycle = MIN (ps->min_cycle, c);
2468 ps->max_cycle = MAX (ps->max_cycle, c);
2469 return ps_i;
2470 }
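
/* Callers typically probe each cycle of a scheduling window in turn; a
   hedged sketch (not the exact driver code):

     for (c = start; c != end; c += step)
       if ((psi = ps_add_node_check_conflicts (ps, u_node, c,
                                               must_precede, must_follow)))
         break;  */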
2471
2472 /* Rotate the rows of PS such that insns scheduled at time
2473 START_CYCLE will appear in row 0. Updates max/min_cycles. */
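/* For example, with II = 4 and START_CYCLE = 6, the rows are rotated
   backwards SMODULO (6, 4) = 2 times, so the insns of old row 2 end up
   in row 0.  */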
2474 void
2475 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2476 {
2477 int i, row, backward_rotates;
2478 int last_row = ps->ii - 1;
2479
2480 if (start_cycle == 0)
2481 return;
2482
2483 backward_rotates = SMODULO (start_cycle, ps->ii);
2484
2485 /* Revisit later and optimize this into a single loop. */
2486 for (i = 0; i < backward_rotates; i++)
2487 {
2488 ps_insn_ptr first_row = ps->rows[0];
2489
2490 for (row = 0; row < last_row; row++)
2491 ps->rows[row] = ps->rows[row+1];
2492
2493 ps->rows[last_row] = first_row;
2494 }
2495
2496 ps->max_cycle -= start_cycle;
2497 ps->min_cycle -= start_cycle;
2498 }
2499
2500 /* Remove the node N from the partial schedule PS.  Because we restart
2501 the DFA each time we want to check for resource conflicts, this is
2502 equivalent to unscheduling the node N.  */
2503 static bool
2504 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2505 {
2506 ps_insn_ptr ps_i;
2507 int row = SMODULO (SCHED_TIME (n), ps->ii);
2508
2509 if (row < 0 || row >= ps->ii)
2510 return false;
2511
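/* Locate the ps_insn of N within its row; the loop body is
   intentionally empty.  */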
2512 for (ps_i = ps->rows[row];
2513 ps_i && ps_i->node != n;
2514 ps_i = ps_i->next_in_row);
2515 if (!ps_i)
2516 return false;
2517
2518 return remove_node_from_ps (ps, ps_i);
2519 }
2520 #endif /* INSN_SCHEDULING */
2521