/* Swing Modulo Scheduling implementation.
   Copyright (C) 2004, 2005
   Free Software Foundation, Inc.
   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */


#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "toplev.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "regs.h"
#include "function.h"
#include "flags.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "except.h"
#include "recog.h"
#include "sched-int.h"
#include "target.h"
#include "cfglayout.h"
#include "cfgloop.h"
#include "cfghooks.h"
#include "expr.h"
#include "params.h"
#include "gcov-io.h"
#include "df.h"
#include "ddg.h"

#ifdef INSN_SCHEDULING

/* This file contains the implementation of the Swing Modulo Scheduler,
   described in the following references:
   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
       Lifetime--sensitive modulo scheduling in a production environment.
       IEEE Trans. on Comps., 50(3), March 2001
   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
       PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).

   The basic structure is:
   1. Build a data-dependence graph (DDG) for each loop.
   2. Use the DDG to order the insns of a loop (not necessarily in
      topological order), trying to place each insn after all its
      predecessors _or_ after all its successors.
   3. Compute MII: a lower bound on the number of cycles to schedule the loop.
   4. Use the ordering to perform list-scheduling of the loop:
      1. Set II = MII.  We will try to schedule the loop within II cycles.
      2. Try to schedule the insns one by one according to the ordering.
         For each insn compute an interval of cycles by considering already-
         scheduled preds and succs (and associated latencies); try to place
         the insn in the cycles of this window checking for potential
         resource conflicts (using the DFA interface).
         Note: this is different from the cycle-scheduling of schedule_insns;
         here the insns are not scheduled monotonically top-down (nor bottom-
         up).
      3. If we fail to schedule all insns, bump II and try again, unless
         II reaches an upper bound MaxII, in which case report failure.
   5. If we succeeded in scheduling the loop within II cycles, we now
      generate prolog and epilog, decrease the counter of the loop, and
      perform modulo variable expansion for live ranges that span more than
      II cycles (i.e. use register copies to prevent a def from overwriting
      itself before reaching the use).  */
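
/* For illustration (not part of the pass): take a loop body A; B; C
   scheduled with II = 1 and a stage count of 3, i.e. A, B and C land in
   stages 0, 1 and 2 of the kernel.  Schematically the generated code is:

       A           ; prolog: stage 0 of iteration 1
       A' B        ; prolog: stages 0-1 of iterations 2, 1
    L: A'' B' C    ; kernel: one insn of each of 3 iterations in flight
       branch L
       B'' C'      ; epilog: draining the last iterations
       C''

   The loop count is decreased by stage_count - 1 = 2 because the prolog
   and epilog together complete two full iterations.  */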

\f
/* This page defines partial-schedule structures and functions for
   modulo scheduling.  */

typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
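
/* For example, with ii = 4 a node placed at absolute cycle -1 belongs in
   row SMODULO (-1, 4) == 3, whereas plain -1 % 4 yields -1 on most
   targets.  */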

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
                             + 1 + (ps)->ii - 1) / (ps)->ii)
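
/* E.g. min_cycle = 2, max_cycle = 9 and ii = 4 give a stage count of
   (9 - 2 + 1 + 4 - 1) / 4 == 2, i.e. ceil (8 / 4) iterations in flight.  */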

/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  */
  ps_insn_ptr next_in_row,
              prev_in_row;

  /* The number of nodes in the same row that come after this node.  */
  int row_rest_count;
};

/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;       /* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;    /* The DDG of the insns in the partial schedule.  */
};
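
/* For example, with ii = 3 an insn scheduled at absolute cycle 7 is kept
   on the list rows[SMODULO (7, 3)] == rows[1].  */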

/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.  */
struct undo_replace_buff_elem
{
  rtx insn;
  rtx orig_reg;
  rtx new_reg;
  struct undo_replace_buff_elem *next;
};


partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
void free_partial_schedule (partial_schedule_ptr);
void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
                                                ddg_node_ptr node, int cycle,
                                                sbitmap must_precede,
                                                sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr);

\f
/* This page defines constants and structures for the modulo scheduling
   driver.  */

/* As in haifa-sched.c:  */
/* issue_rate is the number of insns that can be scheduled in the same
   machine cycle.  It can be defined in the config/mach/mach.h file,
   otherwise we set it to 1.  */

static int issue_rate;

/* For printing statistics.  */
static FILE *stats_file;

static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
                                                   int *, FILE *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop,
                                    rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
                                       int from_stage, int to_stage,
                                       int is_prolog);

#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
        (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
        (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)

/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int asap;     /* A lower-bound on the absolute scheduling cycle.  */
  int time;     /* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
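
/* For example, a node with time = 7 under ii = 3 gets row = 1 and
   stage = 2: it occupies the second cycle of the kernel, two stages
   after the start of the iteration it belongs to.  */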

\f
/* The following three functions are copied from the current scheduler
   code in order to use sched_analyze () for computing the dependencies.
   They are used when initializing the sched_info structure.  */
static const char *
sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
{
  static char tmp[80];

  sprintf (tmp, "i%4d", INSN_UID (insn));
  return tmp;
}

static int
contributes_to_priority (rtx next, rtx insn)
{
  return BLOCK_NUM (next) == BLOCK_NUM (insn);
}

static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
                               regset cond_exec ATTRIBUTE_UNUSED,
                               regset used ATTRIBUTE_UNUSED,
                               regset set ATTRIBUTE_UNUSED)
{
}

static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  contributes_to_priority,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0
};


/* Return the register decremented and tested in INSN,
   or zero if it is not a decrement-and-branch insn.  */

static rtx
doloop_register_get (rtx insn)
{
  rtx pattern, reg, condition;

  if (! JUMP_P (insn))
    return NULL_RTX;

  pattern = PATTERN (insn);
  condition = doloop_condition_get (pattern);
  if (! condition)
    return NULL_RTX;

  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
           && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  return reg;
}

/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
   that the number of iterations is a compile-time constant.  If so,
   return the rtx that sets COUNT_REG to a constant, and set COUNT to
   this constant.  Otherwise return 0.  */
static rtx
const_iteration_count (rtx count_reg, basic_block pre_header,
                       HOST_WIDEST_INT * count)
{
  rtx insn;
  rtx head, tail;

  if (! pre_header)
    return NULL_RTX;

  get_block_head_tail (pre_header->index, &head, &tail);

  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
    if (INSN_P (insn) && single_set (insn)
        && rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
      {
        rtx pat = single_set (insn);

        if (GET_CODE (SET_SRC (pat)) == CONST_INT)
          {
            *count = INTVAL (SET_SRC (pat));
            return insn;
          }

        return NULL_RTX;
      }

  return NULL_RTX;
}

/* A very simple resource-based lower bound on the initiation interval.
   ??? Improve the accuracy of this bound by considering the
   utilization of various units.  */
static int
res_MII (ddg_ptr g)
{
  return (g->num_nodes / issue_rate);
}
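
/* For instance, a DDG with 8 nodes on a 4-issue machine yields
   res_MII = 2: at least two cycles are needed just to issue all insns.  */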


/* Points to the array that contains the sched data for each node.  */
static node_sched_params_ptr node_sched_params;

/* Allocate sched_params for each node and initialize it.  Assumes that
   the aux field of each node contains the asap bound (computed earlier),
   and copies it into the sched_params field.  */
static void
set_node_sched_params (ddg_ptr g)
{
  int i;

  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
  /* Initialize ASAP/ALAP/HEIGHT to zero.  */
  node_sched_params = (node_sched_params_ptr)
                       xcalloc (g->num_nodes,
                                sizeof (struct node_sched_params));

  /* Set the pointer of the general data of the node to point to the
     appropriate sched_params structure.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      /* Watch out for aliasing problems?  */
      node_sched_params[i].asap = g->nodes[i].aux.count;
      g->nodes[i].aux.info = &node_sched_params[i];
    }
}

static void
print_node_sched_params (FILE * dump_file, int num_nodes)
{
  int i;

  if (! dump_file)
    return;
  for (i = 0; i < num_nodes; i++)
    {
      node_sched_params_ptr nsp = &node_sched_params[i];
      rtx reg_move = nsp->first_reg_move;
      int j;

      fprintf (dump_file, "Node %d:\n", i);
      fprintf (dump_file, " asap = %d:\n", nsp->asap);
      fprintf (dump_file, " time = %d:\n", nsp->time);
      fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
      for (j = 0; j < nsp->nreg_moves; j++)
        {
          fprintf (dump_file, " reg_move = ");
          print_rtl_single (dump_file, reg_move);
          reg_move = PREV_INSN (reg_move);
        }
    }
}

/* Calculate an upper bound for II.  SMS should not schedule the loop if it
   requires more cycles than this bound.  Currently set to the sum of the
   longest latency edge for each node; this value was chosen based on
   experiments and may be revisited.  */
static int
calculate_maxii (ddg_ptr g)
{
  int i;
  int maxii = 0;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int max_edge_latency = 0;

      for (e = u->out; e; e = e->next_out)
        max_edge_latency = MAX (max_edge_latency, e->latency);

      maxii += max_edge_latency;
    }
  return maxii;
}
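
/* E.g. three nodes whose longest outgoing latencies are 2, 3 and 1 give
   maxii = 6; the driver below further scales this by SMS_MAX_II_FACTOR
   percent before using it as the upper bound on II.  */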

/*
   Breaking intra-loop register anti-dependences:
   Each intra-loop register anti-dependence implies a cross-iteration true
   dependence of distance 1.  Therefore, we can remove such false dependencies
   and figure out if the partial schedule broke them by checking if (for a
   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
   if so generate a register move.  The number of such moves is equal to:

                 SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
   nreg_moves = ------------------------------------- + 1 - {   dependence.
                                 ii                         { 1 if not.
*/
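
/* For instance, if a def is scheduled at cycle 1 and its farthest use at
   cycle 9 under ii = 4 (distance 0), the value must survive
   (9 - 1) / 4 = 2 kernel iterations, so two register copies are
   generated for it below.  */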
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps)
{
  ddg_ptr g = ps->g;
  int ii = ps->ii;
  int i;
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx last_reg_move;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
         ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

            /* If dest precedes src in the schedule of the kernel, then dest
               will read before src writes and we can save one reg_copy.  */
            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              nreg_moves4e--;

            nreg_moves = MAX (nreg_moves, nreg_moves4e);
          }

      if (nreg_moves == 0)
        continue;

      /* Every use of the register defined by node may require a different
         copy of this register, depending on the time the use is scheduled.
         Set a bitmap vector, telling which nodes use each copy of this
         register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
        if (e->type == TRUE_DEP && e->dest != e->src)
          {
            int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

            if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
                && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
              dest_copy--;

            if (dest_copy)
              SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
          }

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      last_reg_move = u->insn;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
        {
          int i_use;
          rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
          rtx reg_move = gen_move_insn (new_reg, prev_reg);

          add_insn_before (reg_move, last_reg_move);
          last_reg_move = reg_move;

          if (!SCHED_FIRST_REG_MOVE (u))
            SCHED_FIRST_REG_MOVE (u) = reg_move;

          EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
            {
              struct undo_replace_buff_elem *rep;

              rep = (struct undo_replace_buff_elem *)
                    xcalloc (1, sizeof (struct undo_replace_buff_elem));
              rep->insn = g->nodes[i_use].insn;
              rep->orig_reg = old_reg;
              rep->new_reg = new_reg;

              if (! reg_move_replaces)
                reg_move_replaces = rep;
              else
                {
                  rep->next = reg_move_replaces;
                  reg_move_replaces = rep;
                }

              replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
            });

          prev_reg = new_reg;
        }
      sbitmap_vector_free (uses_of_defs);
    }
  return reg_move_replaces;
}

/* We call this when we want to undo the SMS schedule for a given loop.
   One of the things that we do is to delete the register moves generated
   for the sake of SMS; this function deletes the register move instructions
   recorded in the undo buffer.  */
static void
undo_generate_reg_moves (partial_schedule_ptr ps,
                         struct undo_replace_buff_elem *reg_move_replaces)
{
  int i, j;

  for (i = 0; i < ps->g->num_nodes; i++)
    {
      ddg_node_ptr u = &ps->g->nodes[i];
      rtx prev;
      rtx crr = SCHED_FIRST_REG_MOVE (u);

      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
        {
          prev = PREV_INSN (crr);
          delete_insn (crr);
          crr = prev;
        }
      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
    }

  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
    }
}

/* Free memory allocated for the undo buffer.  */
static void
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
{
  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      free (rep);
    }
}

/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
   of SCHED_ROW and SCHED_STAGE.  */
static void
normalize_sched_times (partial_schedule_ptr ps)
{
  int i;
  ddg_ptr g = ps->g;
  int amount = PS_MIN_CYCLE (ps);
  int ii = ps->ii;

  /* Don't include the closing branch assuming that it is the last node.  */
  for (i = 0; i < g->num_nodes - 1; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      int normalized_time = SCHED_TIME (u) - amount;

      gcc_assert (normalized_time >= 0);

      SCHED_TIME (u) = normalized_time;
      SCHED_ROW (u) = normalized_time % ii;
      SCHED_STAGE (u) = normalized_time / ii;
    }
}

/* Set SCHED_COLUMN of each node according to its position in PS.  */
static void
set_columns_for_ps (partial_schedule_ptr ps)
{
  int row;

  for (row = 0; row < ps->ii; row++)
    {
      ps_insn_ptr cur_insn = ps->rows[row];
      int column = 0;

      for (; cur_insn; cur_insn = cur_insn->next_in_row)
        SCHED_COLUMN (cur_insn->node) = column++;
    }
}

/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
static void
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int ii = ps->ii;
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      if (PREV_INSN (last) != ps_ij->node->insn)
        reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
                            PREV_INSN (last));
}

/* As part of undoing SMS we return to the original ordering of the
   instructions inside the loop kernel.  Given the partial schedule PS, this
   function restores the ordering of the instructions according to their CUID
   in the DDG (PS->G), which is the original order of the instructions before
   performing SMS.  */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  for (i = 0; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
        || last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
                          PREV_INSN (last));
}

/* Used to generate the prologue & epilogue.  Duplicate the subset of
   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
   of both), together with a prefix/suffix of their reg_moves.  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
                           int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
        ddg_node_ptr u_node = ps_ij->node;
        int j, i_reg_moves;
        rtx reg_move = NULL_RTX;

        if (for_prolog)
          {
            /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
               number of reg_moves starting with the second occurrence of
               u_node, which is generated if its SCHED_STAGE <= to_stage.  */
            i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *first* reg_move backwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < i_reg_moves; j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }
        else /* It's for the epilog.  */
          {
            /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
               starting to decrease one stage after u_node no longer occurs;
               that is, generate all reg_moves until
               SCHED_STAGE (u_node) == from_stage - 1.  */
            i_reg_moves = SCHED_NREG_MOVES (u_node)
                          - (from_stage - SCHED_STAGE (u_node) - 1);
            i_reg_moves = MAX (i_reg_moves, 0);
            i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

            /* The reg_moves start from the *last* reg_move forwards.  */
            if (i_reg_moves)
              {
                reg_move = SCHED_FIRST_REG_MOVE (u_node);
                for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
                  reg_move = PREV_INSN (reg_move);
              }
          }

        for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
          emit_insn (copy_rtx (PATTERN (reg_move)));
        if (SCHED_STAGE (u_node) >= from_stage
            && SCHED_STAGE (u_node) <= to_stage)
          duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}
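
/* For illustration: when generating the prolog with from_stage = 0 and
   to_stage = 1, a node of stage 0 that has 3 reg_moves contributes
   MIN (1 - 0 + 1, 3) = 2 of them, emitted from the first reg_move
   backwards, plus a copy of the insn itself since its stage lies
   within [0, 1].  */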


/* Generate the instructions (including reg_moves) for prolog & epilog.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
                        rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
    /* Generate a subtract instruction at the beginning of the prolog to
       adjust the loop count by STAGE_COUNT.  */
    emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog on the one and only entry edge.  */
  e = loop_preheader_edge (loop);
  loop_split_edge_with (e, get_insns ());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilog on the one and only exit edge.  */
  gcc_assert (loop->single_exit);
  e = loop->single_exit;
  loop_split_edge_with (e, get_insns ());
  end_sequence ();
}

/* Return the line note insn preceding INSN, for debugging.  Taken from
   emit-rtl.c.  */
static rtx
find_line_note (rtx insn)
{
  for (; insn; insn = PREV_INSN (insn))
    if (NOTE_P (insn)
        && NOTE_LINE_NUMBER (insn) >= 0)
      break;

  return insn;
}

/* Return true if all the BBs of the loop are empty except the
   loop header.  */
static bool
loop_single_full_bb_p (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      rtx head, tail;
      bool empty_bb = true;

      if (bbs[i] == loop->header)
        continue;

      /* Make sure that basic blocks other than the header
         have only notes, labels or jumps.  */
      get_block_head_tail (bbs[i]->index, &head, &tail);
      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
        {
          if (NOTE_P (head) || LABEL_P (head)
              || (INSN_P (head) && JUMP_P (head)))
            continue;
          empty_bb = false;
          break;
        }

      if (! empty_bb)
        {
          free (bbs);
          return false;
        }
    }
  free (bbs);
  return true;
}

/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3)                  \
                                 && (EDGE_COUNT (loop->latch->preds) == 1) \
                                 && (EDGE_COUNT (loop->latch->succs) == 1))

/* Return true if the loop is in its canonical form and false if not,
   i.e. it is a SIMPLE_SMS_LOOP_P and has one preheader block and a
   single exit.  */
static bool
loop_canon_p (struct loop *loop, FILE *dump_file)
{
  if (loop->inner || ! loop->outer)
    return false;

  if (!loop->single_exit)
    {
      if (dump_file)
        {
          rtx line_note = find_line_note (BB_END (loop->header));

          fprintf (dump_file, "SMS loop many exits ");
          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (dump_file, " %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
        }
      return false;
    }

  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
    {
      if (dump_file)
        {
          rtx line_note = find_line_note (BB_END (loop->header));

          fprintf (dump_file, "SMS loop many BBs. ");
          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (dump_file, " %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
        }
      return false;
    }

  return true;
}

/* If there are more than one entry for the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to the exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      loop_split_edge_with (e, NULL_RTX);

  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      FOR_EACH_EDGE (e, i, loop->header->preds)
        if (e->src == loop->latch)
          break;
      loop_split_edge_with (e, NULL_RTX);
    }
}

/* Build the loop information without loop
   canonization; the loop canonization will
   be performed if the loop is SMSable.  */
static struct loops *
build_loops_structure (FILE *dumpfile)
{
  struct loops *loops = xcalloc (1, sizeof (struct loops));

  /* Find the loops.  */

  if (flow_loops_find (loops) <= 1)
    {
      /* No loops.  */
      flow_loops_free (loops);
      free (loops);

      return NULL;
    }

  /* Not going to update these.  */
  free (loops->cfg.rc_order);
  loops->cfg.rc_order = NULL;
  free (loops->cfg.dfs_order);
  loops->cfg.dfs_order = NULL;

  create_preheaders (loops, CP_SIMPLE_PREHEADERS);
  mark_single_exit_loops (loops);
  /* Dump loops.  */
  flow_loops_dump (loops, dumpfile, NULL, 1);

#ifdef ENABLE_CHECKING
  verify_dominators (CDI_DOMINATORS);
  verify_loop_structure (loops);
#endif

  return loops;
}

/* Main entry point, perform SMS scheduling on the loops of the function
   that consist of single basic blocks.  */
void
sms_schedule (FILE *dump_file)
{
  static int passes = 0;
  rtx insn;
  ddg_ptr *g_arr, g;
  int *node_order;
  int maxii;
  unsigned i, num_loops;
  partial_schedule_ptr ps;
  struct df *df;
  struct loops *loops;
  basic_block bb = NULL;
  /* Variables used for loop versioning, only if needed.  */
  struct loop *nloop;
  basic_block condition_bb = NULL;
  edge latch_edge;
  gcov_type trip_count = 0;

  if (! (loops = build_loops_structure (dump_file)))
    return;  /* There are no loops to schedule.  */


  stats_file = dump_file;

  /* Initialize issue_rate.  */
  if (targetm.sched.issue_rate)
    {
      int temp = reload_completed;

      reload_completed = 1;
      issue_rate = targetm.sched.issue_rate ();
      reload_completed = temp;
    }
  else
    issue_rate = 1;

  /* Initialize the scheduler.  */
  current_sched_info = &sms_sched_info;
  sched_init (NULL);

  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
  df = df_init ();
  df_analyze (df, 0, DF_ALL);

  /* Allocate memory to hold the DDG array one entry for each loop.
     We use loop->num as index into this array.  */
  g_arr = xcalloc (loops->num, sizeof (ddg_ptr));


  /* Build DDGs for all the relevant loops and hold them in G_ARR
     indexed by the loop index.  */
  for (i = 0; i < loops->num; i++)
    {
      rtx head, tail;
      rtx count_reg;
      struct loop *loop = loops->parray[i];

      /* For debugging.  */
      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
        {
          if (dump_file)
            fprintf (dump_file, "SMS reached MAX_PASSES... \n");

          break;
        }

      if (! loop_canon_p (loop, dump_file))
        continue;

      if (! loop_single_full_bb_p (loop))
        continue;

      bb = loop->header;

      get_block_head_tail (bb->index, &head, &tail);
      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      if (loop->single_exit->count)
        trip_count = latch_edge->count / loop->single_exit->count;

      /* Perform SMS only on loops whose average count is above the
         threshold.  */

      if (latch_edge->count
          && (latch_edge->count
              < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
        {
          if (stats_file)
            {
              rtx line_note = find_line_note (tail);

              if (line_note)
                {
                  expanded_location xloc;
                  NOTE_EXPANDED_LOCATION (xloc, line_note);
                  fprintf (stats_file, "SMS bb %s %d (file, line)\n",
                           xloc.file, xloc.line);
                }
              fprintf (stats_file, "SMS single-bb-loop\n");
              if (profile_info && flag_branch_probabilities)
                {
                  fprintf (stats_file, "SMS loop-count ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) bb->count);
                  fprintf (stats_file, "\n");
                  fprintf (stats_file, "SMS trip-count ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) trip_count);
                  fprintf (stats_file, "\n");
                  fprintf (stats_file, "SMS profile-sum-max ");
                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) profile_info->sum_max);
                  fprintf (stats_file, "\n");
                }
            }
          continue;
        }

      /* Make sure this is a doloop.  */
      if (! (count_reg = doloop_register_get (tail)))
        continue;

      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
        if (CALL_P (insn)
            || BARRIER_P (insn)
            || (INSN_P (insn) && !JUMP_P (insn)
                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
          break;

      if (insn != NEXT_INSN (tail))
        {
          if (stats_file)
            {
              if (CALL_P (insn))
                fprintf (stats_file, "SMS loop-with-call\n");
              else if (BARRIER_P (insn))
                fprintf (stats_file, "SMS loop-with-barrier\n");
              else
                fprintf (stats_file, "SMS loop-with-not-single-set\n");
              print_rtl_single (stats_file, insn);
            }

          continue;
        }

      if (! (g = create_ddg (bb, df, 0)))
        {
          if (stats_file)
            fprintf (stats_file, "SMS doloop\n");
          continue;
        }

      g_arr[i] = g;
    }

  /* Release Data Flow analysis data structures.  */
  df_finish (df);

  /* We don't want to perform SMS on new loops - created by versioning.  */
  num_loops = loops->num;
  /* Go over the built DDGs and perform SMS for each one of them.  */
  for (i = 0; i < num_loops; i++)
    {
      rtx head, tail;
      rtx count_reg, count_init;
      int mii, rec_mii;
      unsigned stage_count = 0;
      HOST_WIDEST_INT loop_count = 0;
      struct loop *loop = loops->parray[i];

      if (! (g = g_arr[i]))
        continue;

      if (dump_file)
        print_ddg (dump_file, g);

      get_block_head_tail (loop->header->index, &head, &tail);

      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      if (loop->single_exit->count)
        trip_count = latch_edge->count / loop->single_exit->count;

      if (stats_file)
        {
          rtx line_note = find_line_note (tail);

          if (line_note)
            {
              expanded_location xloc;
              NOTE_EXPANDED_LOCATION (xloc, line_note);
              fprintf (stats_file, "SMS bb %s %d (file, line)\n",
                       xloc.file, xloc.line);
            }
          fprintf (stats_file, "SMS single-bb-loop\n");
          if (profile_info && flag_branch_probabilities)
            {
              fprintf (stats_file, "SMS loop-count ");
              fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) bb->count);
              fprintf (stats_file, "\n");
              fprintf (stats_file, "SMS profile-sum-max ");
              fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) profile_info->sum_max);
              fprintf (stats_file, "\n");
            }
          fprintf (stats_file, "SMS doloop\n");
          fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
          fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
          fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
        }


      /* If the loop has a doloop register, it gets special
         handling.  */
      count_init = NULL_RTX;
      if ((count_reg = doloop_register_get (tail)))
        {
          basic_block pre_header;

          pre_header = loop_preheader_edge (loop)->src;
          count_init = const_iteration_count (count_reg, pre_header,
                                              &loop_count);
        }
      gcc_assert (count_reg);

      if (stats_file && count_init)
        {
          fprintf (stats_file, "SMS const-doloop ");
          fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
                   loop_count);
          fprintf (stats_file, "\n");
        }

      node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);

      mii = 1; /* Need to pass some estimate of mii.  */
      rec_mii = sms_order_nodes (g, mii, node_order);
      mii = MAX (res_MII (g), rec_mii);
      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;

      if (stats_file)
        fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
                 rec_mii, mii, maxii);

      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
         ASAP.  */
      set_node_sched_params (g);

      ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);

      if (ps)
        stage_count = PS_STAGE_COUNT (ps);

      /* Stage count of 1 means that there is no interleaving between
         iterations, let the scheduling passes do the job.  */
      if (stage_count < 1
          || (count_init && (loop_count <= stage_count))
          || (flag_branch_probabilities && (trip_count <= stage_count)))
        {
          if (dump_file)
            {
              fprintf (dump_file, "SMS failed... \n");
              fprintf (dump_file,
                       "SMS sched-failed (stage-count=%d, loop-count=",
                       stage_count);
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
              fprintf (dump_file, ", trip-count=");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
              fprintf (dump_file, ")\n");
            }
          continue;
        }
      else
        {
          int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb),
                                                     BB_END (g->bb));
          int new_cycles;
          struct undo_replace_buff_elem *reg_move_replaces;

          if (stats_file)
            {
              fprintf (stats_file,
                       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
                       stage_count);
              print_partial_schedule (ps, stats_file);
              fprintf (stats_file,
                       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
                       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
            }

          /* Set the stage boundaries.  If the DDG is built with
             closing_branch_deps, the closing_branch was scheduled and should
             appear in the last (ii-1) row.  Otherwise, we are free to schedule
             the branch, and we let nodes that were scheduled at the first
             PS_MIN_CYCLE cycle appear in the first row; this should reduce
             stage_count to minimum.  */
          normalize_sched_times (ps);
          rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
          set_columns_for_ps (ps);

          /* Generate the kernel just to be able to measure its cycles.  */
          permute_partial_schedule (ps, g->closing_branch->first_note);
          reg_move_replaces = generate_reg_moves (ps);

          /* Get the number of cycles the new kernel expects to execute in.  */
          new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb),
                                                BB_END (g->bb));

          /* Get back to the original loop so we can do loop versioning.  */
          undo_permute_partial_schedule (ps, g->closing_branch->first_note);
          if (reg_move_replaces)
            undo_generate_reg_moves (ps, reg_move_replaces);

          if (new_cycles >= orig_cycles)
            {
              /* SMS is not profitable so undo the permutation and reg move
                 generation and return the kernel to its original state.  */
              if (dump_file)
                fprintf (dump_file,
                         "Undoing SMS because it is not profitable.\n");
            }
          else
            {
              canon_loop (loop);

              /* In case the BCT count is not known, do loop-versioning.  */
              if (count_reg && ! count_init)
                {
                  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
                                                 GEN_INT (stage_count));

                  nloop = loop_version (loops, loop, comp_rtx, &condition_bb);
                }

              /* Set new iteration count of loop kernel.  */
              if (count_reg && count_init)
                SET_SRC (single_set (count_init)) = GEN_INT (loop_count
                                                             - stage_count + 1);

              /* Now apply the scheduled kernel to the RTL of the loop.  */
              permute_partial_schedule (ps, g->closing_branch->first_note);

              /* Mark this loop as software pipelined so the later
                 scheduling passes don't touch it.  */
              if (! flag_resched_modulo_sched)
                g->bb->flags |= BB_DISABLE_SCHEDULE;
              /* The life-info is not valid any more.  */
              g->bb->flags |= BB_DIRTY;

              reg_move_replaces = generate_reg_moves (ps);
              if (dump_file)
                print_node_sched_params (dump_file, g->num_nodes);
              /* Generate prolog and epilog.  */
              if (count_reg && !count_init)
                generate_prolog_epilog (ps, loop, count_reg);
              else
                generate_prolog_epilog (ps, loop, NULL_RTX);
            }
          free_undo_replace_buff (reg_move_replaces);
        }

      free_partial_schedule (ps);
      free (node_sched_params);
      free (node_order);
      free_ddg (g);
    }

  /* Release scheduler data, needed until now because of DFA.  */
  sched_finish ();
  loop_optimizer_finalize (loops, dump_file);
}

/* The SMS scheduling algorithm itself
   -----------------------------------
   Input: 'O' an ordered list of insns of a loop.
   Output: A scheduling of the loop - kernel, prolog, and epilogue.

   'Q' is the empty Set
   'PS' is the partial schedule; it holds the currently scheduled nodes with
        their cycle/slot.
   'PSP' previously scheduled predecessors.
   'PSS' previously scheduled successors.
   't(u)' the cycle where u is scheduled.
   'l(u)' is the latency of u.
   'd(v,u)' is the dependence distance from v to u.
   'ASAP(u)' the earliest time at which u could be scheduled as computed in
             the node ordering phase.
   'check_hardware_resources_conflicts(u, PS, c)'
             run a trace around cycle/slot through DFA model
             to check resource conflicts involving instruction u
             at cycle c given the partial schedule PS.
   'add_to_partial_schedule_at_time(u, PS, c)'
             Add the node/instruction u to the partial schedule
             PS at time c.
   'calculate_register_pressure(PS)'
             Given a schedule of instructions, calculate the register
             pressure it implies.  One implementation could be the
             maximum number of overlapping live ranges.
   'maxRP' The maximum allowed register pressure, usually derived from the
           number of registers available in the hardware.

   1. II = MII.
   2. PS = empty list
   3. for each node u in O in pre-computed order
   4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   6.     start = Early_start; end = Early_start + II - 1; step = 1
   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   13.    start = Late_start; end = Late_start - II + 1; step = -1
   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.    Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   16.    Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   17.    start = Early_start;
   18.    end = min(Early_start + II - 1 , Late_start);
   19.    step = 1
   20.  else "if (PSP(u) == Q && PSS(u) == Q)"
   21.    start = ASAP(u); end = start + II - 1; step = 1
   22.  endif

   23.  success = false
   24.  for (c = start ; c != end ; c += step)
   25.     if check_hardware_resources_conflicts(u, PS, c) then
   26.       add_to_partial_schedule_at_time(u, PS, c)
   27.       success = true
   28.       break
   29.     endif
   30.  endfor
   31.  if (success == false) then
   32.    II = II + 1
   33.    if (II > maxII) then
   34.       finish - failed to schedule
   35.    endif
   36.    goto 2.
   37.  endif
   38. endfor
   39. if (calculate_register_pressure(PS) > maxRP) then
   40.    goto 32.
   41. endif
   42. compute epilogue & prologue
   43. finish - succeeded to schedule  */
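
/* A worked instance of the window computation above: let u have a single
   already-scheduled predecessor v with t(v) = 3, l(v) = 2 and d(v,u) = 0
   under II = 4.  Then Early_start(u) = 3 + 2 - 0*4 = 5 and the window
   scanned is [5, 5 + II - 1] = [5, 8] with step 1.  */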

/* A limit on the number of cycles that resource conflicts can span.
   ??? Should be provided by DFA, and be dependent on the type of insn
   scheduled.  Currently set to 0 to save compile time.  */
#define DFA_HISTORY SMS_DFA_HISTORY

/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get a non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
                  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
                  int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order[i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int node_st = SCHED_TIME (v_node)
                            + e->latency - (e->distance * ii);

              early_start = MAX (early_start, node_st);

              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MAX (end, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              early_start = MAX (early_start,
                                 SCHED_TIME (v_node) + e->latency
                                 - (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}

/* This function implements the scheduling algorithm for SMS according to the
   above algorithm.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order,
                       FILE *dump_file)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
          || try_again_with_larger_ii) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
        fprintf (dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
        {
          try_again_with_larger_ii = false;
          sbitmap_zero (sched_nodes);
        }

      for (i = 0; i < num_nodes; i++)
        {
          int u = nodes_order[i];
          ddg_node_ptr u_node = &ps->g->nodes[u];
          rtx insn = u_node->insn;

          if (!INSN_P (insn))
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (JUMP_P (insn)) /* Closing branch handled later.  */
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (TEST_BIT (sched_nodes, u))
            continue;

          /* Try to get a non-empty scheduling window.  */
          j = i;
          while (get_sched_window (ps, nodes_order, i, sched_nodes, ii,
                                   &start, &step, &end) < 0
                 && j > 0)
            {
              unscheduled_nodes = true;
              if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
                  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
                {
                  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
                  RESET_BIT (sched_nodes, nodes_order[j - 1]);
                }
              j--;
            }
          if (j < 0)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          /* 2. Try scheduling u in window.  */
          if (dump_file)
            fprintf (dump_file,
                     "Trying to schedule node %d in (%d .. %d) step %d\n",
                     u, start, end, step);

          /* Use must_follow & must_precede bitmaps to determine order
             of nodes within the cycle.  */
          sbitmap_zero (must_precede);
          sbitmap_zero (must_follow);
          for (e = u_node->in; e != 0; e = e->next_in)
            if (TEST_BIT (sched_nodes, e->src->cuid)
                && e->latency == (ii * e->distance)
                && start == SCHED_TIME (e->src))
              SET_BIT (must_precede, e->src->cuid);

          for (e = u_node->out; e != 0; e = e->next_out)
            if (TEST_BIT (sched_nodes, e->dest->cuid)
                && e->latency == (ii * e->distance)
                && end == SCHED_TIME (e->dest))
              SET_BIT (must_follow, e->dest->cuid);

          success = 0;
          if ((step > 0 && start < end) || (step < 0 && start > end))
            for (c = start; c != end; c += step)
              {
                ps_insn_ptr psi;

                psi = ps_add_node_check_conflicts (ps, u_node, c,
                                                   must_precede,
                                                   must_follow);

                if (psi)
                  {
                    SCHED_TIME (u_node) = c;
                    SET_BIT (sched_nodes, u);
                    success = 1;
                    if (dump_file)
                      fprintf (dump_file, "Schedule in %d\n", c);
                    break;
                  }
              }
          if (!success)
            {
              /* ??? Try backtracking instead of immediately ii++?  */
              ii++;
              try_again_with_larger_ii = true;
              reset_partial_schedule (ps, ii);
              break;
            }
          if (unscheduled_nodes)
            break;

          /* ??? If (success), check register pressure estimates.  */
        } /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}

\f
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))
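
/* E.g. a node with ASAP = 2 and ALAP = 5 has MOB = 3; it may be placed
   in any cycle from 2 to 5 without stretching the critical path.  */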

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int *, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

enum sms_direction {BOTTOMUP, TOPDOWN};

struct node_order_params
{
  int asap;
  int alap;
  int height;
};

/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
static void
check_nodes_order (int *node_order, int num_nodes)
{
  int i;
  sbitmap tmp = sbitmap_alloc (num_nodes);

  sbitmap_zero (tmp);

  for (i = 0; i < num_nodes; i++)
    {
      int u = node_order[i];

      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));

      SET_BIT (tmp, u);
    }

  sbitmap_free (tmp);
}

/* Order the nodes of G for scheduling and pass the result in
   NODE_ORDER.  Also set aux.count of each node to ASAP.
   Return the recMII for the given DDG.  */
static int
sms_order_nodes (ddg_ptr g, int mii, int * node_order)
{
  int i;
  int rec_mii = 0;
  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);

  nopa nops = calculate_order_params (g, mii);

  order_nodes_of_sccs (sccs, node_order);

  if (sccs->num_sccs > 0)
    /* First SCC has the largest recurrence_length.  */
    rec_mii = sccs->sccs[0]->recurrence_length;

  /* Save ASAP before destroying node_order_params.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr v = &g->nodes[i];
      v->aux.count = ASAP (v);
    }

  free (nops);
  free_ddg_all_sccs (sccs);
  check_nodes_order (node_order, g->num_nodes);

  return rec_mii;
}

static void
order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
{
  int i, pos = 0;
  ddg_ptr g = all_sccs->ddg;
  int num_nodes = g->num_nodes;
  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
  sbitmap on_path = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap ones = sbitmap_alloc (num_nodes);

  sbitmap_zero (prev_sccs);
  sbitmap_ones (ones);

  /* Perform the node ordering starting from the SCC with the highest recMII.
     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
  for (i = 0; i < all_sccs->num_sccs; i++)
    {
      ddg_scc_ptr scc = all_sccs->sccs[i];

      /* Add nodes on paths from previous SCCs to the current SCC.  */
      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
      sbitmap_a_or_b (tmp, scc->nodes, on_path);

      /* Add nodes on paths from the current SCC to previous SCCs.  */
      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
      sbitmap_a_or_b (tmp, tmp, on_path);

      /* Remove nodes of previous SCCs from current extended SCC.  */
      sbitmap_difference (tmp, tmp, prev_sccs);

      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
    }

  /* Handle the remaining nodes that do not belong to any scc.  Each call
     to order_nodes_in_scc handles a single connected component.  */
  while (pos < g->num_nodes)
    {
      sbitmap_difference (tmp, ones, prev_sccs);
      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
    }
  sbitmap_free (prev_sccs);
  sbitmap_free (on_path);
  sbitmap_free (tmp);
  sbitmap_free (ones);
}

/* The MII parameter would be needed if we were to consider backarcs
   (that do not close recursive cycles); it is currently unused.  */
static struct node_order_params *
calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
{
  int u;
  int max_asap;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  /* Allocate a place to hold ordering params for each node in the DDG.  */
  nopa node_order_params_arr;

  /* Initialize the ASAP/ALAP/HEIGHT of each node to zero.  */
  node_order_params_arr = (nopa) xcalloc (num_nodes,
                                          sizeof (struct node_order_params));

  /* Set the aux pointer of each node to point to its order_params
     structure.  */
  for (u = 0; u < num_nodes; u++)
    g->nodes[u].aux.info = &node_order_params_arr[u];

  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
     calculate ASAP, ALAP, mobility, distance, and height for each node
     in the dependence (directed acyclic) graph.  */

  /* We assume that the nodes in the array are in topological order.  */

  max_asap = 0;
  for (u = 0; u < num_nodes; u++)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ASAP (u_node) = 0;
      for (e = u_node->in; e; e = e->next_in)
        if (e->distance == 0)
          ASAP (u_node) = MAX (ASAP (u_node),
                               ASAP (e->src) + e->latency);
      max_asap = MAX (max_asap, ASAP (u_node));
    }

  for (u = num_nodes - 1; u > -1; u--)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ALAP (u_node) = max_asap;
      HEIGHT (u_node) = 0;
      for (e = u_node->out; e; e = e->next_out)
        if (e->distance == 0)
          {
            ALAP (u_node) = MIN (ALAP (u_node),
                                 ALAP (e->dest) - e->latency);
            HEIGHT (u_node) = MAX (HEIGHT (u_node),
                                   HEIGHT (e->dest) + e->latency);
          }
    }

  return node_order_params_arr;
}
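
/* Worked example (a sketch, not part of the original code): for a
   three-node chain A -> B -> C with latency 2 on each edge and no
   other edges, the two passes above give

     ASAP:   A = 0, B = 2, C = 4          (max_asap = 4)
     ALAP:   A = 0, B = 2, C = 4
     HEIGHT: A = 4, B = 2, C = 0

   so every node has mobility ALAP - ASAP = 0, i.e. no scheduling
   slack.  By contrast, a lone node D with no distance-0 edges would
   get ASAP = 0 and ALAP = max_asap = 4, giving it mobility 4.  */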

/* Find the node in NODES with the greatest ASAP; used to seed the
   ordering of an SCC when neither the predecessors nor the successors
   of the already-ordered nodes intersect it.  */
static int
find_max_asap (ddg_ptr g, sbitmap nodes)
{
  int u;
  int max_asap = -1;
  int result = -1;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_asap < ASAP (u_node))
        {
          max_asap = ASAP (u_node);
          result = u;
        }
    });
  return result;
}

/* Among NODES, find the node with maximal HEIGHT, breaking ties by
   minimal mobility; used by the top-down sweeps below.  */
static int
find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
{
  int u;
  int max_hv = -1;
  int min_mob = INT_MAX;
  int result = -1;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_hv < HEIGHT (u_node))
        {
          max_hv = HEIGHT (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_hv == HEIGHT (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    });
  return result;
}

/* Among NODES, find the node with maximal DEPTH, breaking ties by
   minimal mobility; used by the bottom-up sweeps below.  */
static int
find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
{
  int u;
  int max_dv = -1;
  int min_mob = INT_MAX;
  int result = -1;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_dv < DEPTH (u_node))
        {
          max_dv = DEPTH (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_dv == DEPTH (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    });
  return result;
}
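
/* Tie-break sketch (an illustration, not part of the original code):
   if the workset holds nodes X and Y with HEIGHT (X) == HEIGHT (Y)
   == 5, MOB (X) == 0 and MOB (Y) == 2, find_max_hv_min_mob returns X:
   the node with no slack between its ASAP and ALAP is placed first,
   since delaying it would stretch the schedule.  */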

/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes
   already placed in the NODE_ORDER array, starting from position
   zero.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
                    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
        SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_hv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_succs = NODE_SUCCESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_succs, scc);

              /* Don't consider the already ordered successors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = BOTTOMUP;
          sbitmap_zero (predecessors);
          find_predecessors (predecessors, g, nodes_ordered);
          sbitmap_a_and_b (workset, predecessors, scc);
        }
      else
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_dv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_preds = NODE_PREDECESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_preds, scc);

              /* Don't consider the already ordered predecessors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = TOPDOWN;
          sbitmap_zero (successors);
          find_successors (successors, g, nodes_ordered);
          sbitmap_a_and_b (workset, successors, scc);
        }
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
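
/* Trace sketch (an illustration, not part of the original code): when
   nothing is ordered yet, both intersection tests above fail, so the
   workset is seeded with the SCC node of maximal ASAP and the first
   sweep runs BOTTOMUP: it repeatedly extracts the deepest workset node
   (find_max_dv_min_mob) and refills the workset with that node's
   in-SCC predecessors.  Once the workset drains, the direction flips
   and a TOPDOWN sweep starts from the successors of everything ordered
   so far; the sweeps alternate until the SCC is exhausted.  */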

\f
/* This page contains functions for manipulating partial-schedules during
   modulo scheduling.  */

/* Create a partial schedule and allocate memory to hold II rows.  */
partial_schedule_ptr
create_partial_schedule (int ii, ddg_ptr g, int history)
{
  partial_schedule_ptr ps = (partial_schedule_ptr)
                            xmalloc (sizeof (struct partial_schedule));
  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
  ps->ii = ii;
  ps->history = history;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
  ps->g = g;

  return ps;
}
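
/* Lifecycle sketch (an illustration, not part of the original code):
   a caller typically creates the schedule at II = MII, retries at
   larger IIs on failure, and frees it when done:

     int ii = mii;
     partial_schedule_ptr ps = create_partial_schedule (ii, g, history);

     while (! try_scheduling (ps)
            && ++ii <= maxii)
       reset_partial_schedule (ps, ii);

     free_partial_schedule (ps);

   Here try_scheduling is a hypothetical predicate standing in for the
   real scheduling loop; the actual driver in this file follows this
   create/reset/free discipline, bumping II until the loop schedules
   or MAXII is reached.  */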

/* Free the PS_INSNs in the rows array of the given partial schedule.
   ??? Consider caching the PS_INSNs.  */
static void
free_ps_insns (partial_schedule_ptr ps)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      while (ps->rows[i])
        {
          ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;

          free (ps->rows[i]);
          ps->rows[i] = ps_insn;
        }
      ps->rows[i] = NULL;
    }
}

/* Free all the memory allocated to the partial schedule.  */
void
free_partial_schedule (partial_schedule_ptr ps)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  free (ps->rows);
  free (ps);
}

/* Clear the rows array with its PS_INSNs, and create a new one with
   NEW_II rows.  */
void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  if (new_ii == ps->ii)
    return;
  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
                                       * sizeof (ps_insn_ptr));
  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
  ps->ii = new_ii;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
}

/* Print the partial schedule as an array of II rows; for each row,
   print the UIDs of the insns in it.  */
void
print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      ps_insn_ptr ps_i = ps->rows[i];

      fprintf (dump, "\n[CYCLE %d ]: ", i);
      while (ps_i)
        {
          fprintf (dump, "%d, ",
                   INSN_UID (ps_i->node->insn));
          ps_i = ps_i->next_in_row;
        }
    }
}
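
/* Sample dump (an illustration, not part of the original code): for a
   schedule with II = 2 whose row 0 holds the insns with UIDs 14 and 17
   and whose row 1 holds UID 21, print_partial_schedule emits

     [CYCLE 0 ]: 14, 17,
     [CYCLE 1 ]: 21,

   one line per row of the kernel.  */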

/* Creates a PS_INSN object and initializes it to the given parameters.  */
static ps_insn_ptr
create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
{
  ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));

  ps_i->node = node;
  ps_i->next_in_row = NULL;
  ps_i->prev_in_row = NULL;
  ps_i->row_rest_count = rest_count;
  ps_i->cycle = cycle;

  return ps_i;
}


/* Removes the given PS_INSN from the partial schedule.  Returns false if the
   node is not found in the partial schedule, else returns true.  */
static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
  int row;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);
  if (! ps_i->prev_in_row)
    {
      if (ps_i != ps->rows[row])
        return false;

      ps->rows[row] = ps_i->next_in_row;
      if (ps->rows[row])
        ps->rows[row]->prev_in_row = NULL;
    }
  else
    {
      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
    }
  free (ps_i);
  return true;
}

/* Unlike what the literature describes for modulo scheduling (which
   focuses on VLIW machines), the order of the instructions inside a
   cycle is important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE
   we know where the current instruction should go relative to the
   already scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede, and insert
     the node immediately after the last must-precede, making sure that
     there is no must-follow insn before that point.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
          && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
             there is no possible column.  */
          if (first_must_follow)
            return false;
          else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node just after LAST_MUST_PRECEDE, or at the head
     of the row if there is no must-precede insn.  */

  if (! last_must_precede)
    {
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
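
/* Column sketch (an illustration, not part of the original code):
   suppose row 3 currently holds insns A, B, C in that order, B is set
   in MUST_PRECEDE for the incoming insn X, and C is set in
   MUST_FOLLOW.  The scan records last_must_precede = B and
   first_must_follow = C, so X is linked in between B and C, yielding
   A, B, X, C.  Had C appeared before B in the row, the scan would have
   seen the must-follow first and returned false: no legal column
   exists in that cycle.  */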

/* Advances the PS_INSN one column in its current row; returns false
   on failure and true on success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                        sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}
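
/* Swap sketch (an illustration, not part of the original code): if the
   row is X, Y, Z and PS_I points at X, a successful call reorders the
   row to Y, X, Z; X simply swaps places with its right-hand neighbor,
   and ps->rows[row] is updated when X was the head.  The call fails
   only when X is last in its row or when Y must follow X.  */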

/* Inserts a DDG_NODE into the given partial schedule at the given cycle.
   Returns NULL if this is not possible and a PS_INSN otherwise.  Bit N is
   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
   before/after (respectively) the node pointed to by PS_I when scheduled
   in the same cycle.  */
static ps_insn_ptr
add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
                sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr ps_i;
  int rest_count = 1;
  int row = SMODULO (cycle, ps->ii);

  if (ps->rows[row]
      && ps->rows[row]->row_rest_count >= issue_rate)
    return NULL;

  if (ps->rows[row])
    rest_count += ps->rows[row]->row_rest_count;

  ps_i = create_ps_insn (node, rest_count, cycle);

  /* Finds and inserts PS_I according to MUST_FOLLOW and
     MUST_PRECEDE.  */
  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
    {
      free (ps_i);
      return NULL;
    }

  return ps_i;
}

/* Advance time one cycle.  Assumes DFA is being used.  */
static void
advance_one_cycle (void)
{
  if (targetm.sched.dfa_pre_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_pre_cycle_insn ());

  state_transition (curr_state, NULL);

  if (targetm.sched.dfa_post_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_post_cycle_insn ());
}

/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), find the
   number of cycles the kernel fits in according to the DFA.  We use
   this to check whether SMS did well after register moves are added;
   in some cases the register-move overhead makes the result even worse
   than the original loop.  We want SMS to be performed only when it
   yields fewer cycles once register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
        continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      /* Update the DFA state; if the DFA found a resource conflict,
         start a new cycle.  */
      if (state_transition (curr_state, insn) >= 0)
        {
          cycles++;
          advance_one_cycle ();
          can_issue_more = issue_rate;
        }

      if (targetm.sched.variable_issue)
        can_issue_more =
          targetm.sched.variable_issue (sched_dump, sched_verbose,
                                        insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
         let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
               && GET_CODE (PATTERN (insn)) != CLOBBER)
        can_issue_more--;
    }
  return cycles;
}

/* Checks if PS has resource conflicts according to DFA, starting from
   FROM cycle to TO cycle; returns true if there are conflicts and false
   if there are no conflicts.  Assumes DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
           crr_insn;
           crr_insn = crr_insn->next_in_row)
        {
          rtx insn = crr_insn->node->insn;

          if (!INSN_P (insn))
            continue;

          /* Check if there is room for the current insn.  */
          if (!can_issue_more || state_dead_lock_p (curr_state))
            return true;

          /* Update the DFA state and return with failure if the DFA found
             resource conflicts.  */
          if (state_transition (curr_state, insn) >= 0)
            return true;

          if (targetm.sched.variable_issue)
            can_issue_more =
              targetm.sched.variable_issue (sched_dump, sched_verbose,
                                            insn, can_issue_more);
          /* A naked CLOBBER or USE generates no instruction, so don't
             let them consume issue slots.  */
          else if (GET_CODE (PATTERN (insn)) != USE
                   && GET_CODE (PATTERN (insn)) != CLOBBER)
            can_issue_more--;
        }

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}

/* Checks if the given node causes resource conflicts when added to PS at
   cycle C.  If not, the node is added to PS and returned; otherwise NULL
   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
   cuid N must come before/after (respectively) the node pointed to by
   PS_I when scheduled in the same cycle.  */
ps_insn_ptr
ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
                             int c, sbitmap must_precede,
                             sbitmap must_follow)
{
  int has_conflicts = 0;
  ps_insn_ptr ps_i;

  /* First add the node to the PS; if this succeeds, check for
     conflicts, trying different issue slots in the same row.  */
  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
    return NULL; /* Failed to insert the node at the given cycle.  */

  has_conflicts = ps_has_conflicts (ps, c, c)
                  || (ps->history > 0
                      && ps_has_conflicts (ps,
                                           c - ps->history,
                                           c + ps->history));

  /* Try different issue slots to find one that the given node can be
     scheduled in without conflicts.  */
  while (has_conflicts)
    {
      if (! ps_insn_advance_column (ps, ps_i, must_follow))
        break;
      has_conflicts = ps_has_conflicts (ps, c, c)
                      || (ps->history > 0
                          && ps_has_conflicts (ps,
                                               c - ps->history,
                                               c + ps->history));
    }

  if (has_conflicts)
    {
      remove_node_from_ps (ps, ps_i);
      return NULL;
    }

  ps->min_cycle = MIN (ps->min_cycle, c);
  ps->max_cycle = MAX (ps->max_cycle, c);
  return ps_i;
}
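
/* Caller sketch (an illustration, not part of the original code): the
   scheduling loop in this file computes a window of feasible cycles
   for each node from its already-scheduled neighbors and probes the
   window one cycle at a time:

     int c;
     ps_insn_ptr ps_i = NULL;

     for (c = start; c != end && !ps_i; c += step)
       ps_i = ps_add_node_check_conflicts (ps, u_node, c,
                                           must_precede, must_follow);

   where START, END and STEP delimit the window (STEP is +1 or -1
   depending on the scheduling direction).  */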

/* Rotate the rows of PS such that insns scheduled at time
   START_CYCLE will appear in row 0.  Updates max/min_cycle.  */
void
rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
{
  int i, row, backward_rotates;
  int last_row = ps->ii - 1;

  if (start_cycle == 0)
    return;

  backward_rotates = SMODULO (start_cycle, ps->ii);

  /* Revisit later and optimize this into a single loop.  */
  for (i = 0; i < backward_rotates; i++)
    {
      ps_insn_ptr first_row = ps->rows[0];

      for (row = 0; row < last_row; row++)
        ps->rows[row] = ps->rows[row + 1];

      ps->rows[last_row] = first_row;
    }

  ps->max_cycle -= start_cycle;
  ps->min_cycle -= start_cycle;
}
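
/* Rotation sketch (an illustration, not part of the original code):
   with II = 4 and START_CYCLE = 2, backward_rotates is 2.  Starting
   from rows [r0, r1, r2, r3], the first pass yields [r1, r2, r3, r0]
   and the second [r2, r3, r0, r1], so the insns of cycle 2 now sit in
   row 0 as required, and min/max_cycle shift down by 2.  */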

/* Remove the node N from the partial schedule PS.  Because we restart
   the DFA each time we want to check for resource conflicts, this is
   equivalent to unscheduling the node N.  */
static bool
ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
{
  ps_insn_ptr ps_i;
  int row = SMODULO (SCHED_TIME (n), ps->ii);

  if (row < 0 || row >= ps->ii)
    return false;

  for (ps_i = ps->rows[row];
       ps_i && ps_i->node != n;
       ps_i = ps_i->next_in_row);
  if (!ps_i)
    return false;

  return remove_node_from_ps (ps, ps_i);
}
#endif /* INSN_SCHEDULING */