src/gallium/drivers/vc4/vc4_qir_schedule.c

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  * Copyright © 2014-2015 Broadcom
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * @file vc4_qir_schedule.c
  27  *
  28  * The basic model of the list scheduler is to take a basic block, compute a
  29  * DAG of the dependencies from the bottom up, and make a list of the DAG
  30  * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
  31  * all the parents that are now DAG heads into the list of things to
  32  * schedule.
  33  *
  34  * The goal of scheduling here, before register allocation and conversion to
  35  * QPU instructions, is to reduce register pressure by reordering instructions
  36  * to consume values when possible.
  37  */
  38
  39 #include "vc4_qir.h"
  40 #include "util/dag.h"
  41
  42 static bool debug;
  43
  44 struct schedule_node {
  45         struct dag_node dag;
  46         struct list_head link;
  47         struct qinst *inst;
  48
  49         /* Length of the longest (latency) chain from a DAG head to the this
  50          * instruction.
  51          */
  52         uint32_t delay;
  53
  54         /* Longest time + latency_between(parent, this) of any parent of this
  55          * node.
  56          */
  57         uint32_t unblocked_time;
  58 };
  59
  60 struct schedule_state {
  61         struct dag *dag;
  62
  63         uint32_t time;
  64
  65         uint32_t *temp_writes;
  66
  67         BITSET_WORD *temp_live;
  68 };
  69
  70 /* When walking the instructions in reverse, we need to swap before/after in
  71  * add_dep().
  72  */
  73 enum direction { F, R };
  74
  75 /**
  76  * Marks a dependency between two intructions, that \p after must appear after
  77  * \p before.
  78  *
  79  * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
  80  * the latest instructions with nothing left to schedule are the DAG heads,
  81  * and their inputs are their children.
  82  */
  83 static void
  84 add_dep(enum direction dir,
  85         struct schedule_node *before,
  86         struct schedule_node *after)
  87 {
  88         if (!before || !after)
  89                 return;
  90
  91         assert(before != after);
  92
  93         if (dir == R) {
  94                 struct schedule_node *t = before;
  95                 before = after;
  96                 after = t;
  97         }
  98
  99         dag_add_edge(&after->dag, &before->dag, NULL);
 100 }
 101
 102 static void
 103 add_write_dep(enum direction dir,
 104               struct schedule_node **before,
 105               struct schedule_node *after)
 106 {
 107         add_dep(dir, *before, after);
 108         *before = after;
 109 }
 110
 111 struct schedule_setup_state {
 112         struct schedule_node **last_temp_write;
 113         struct schedule_node *last_sf;
 114         struct schedule_node *last_vary_read;
 115         struct schedule_node *last_vpm_read;
 116         struct schedule_node *last_vpm_write;
 117         struct schedule_node *last_tex_coord;
 118         struct schedule_node *last_tex_result;
 119         struct schedule_node *last_tlb;
 120         struct schedule_node *last_uniforms_reset;
 121         enum direction dir;
 122
 123         /**
 124          * Texture FIFO tracking.  This is done top-to-bottom, and is used to
 125          * track the QOP_TEX_RESULTs and add dependencies on previous ones
 126          * when trying to submit texture coords with TFREQ full or new texture
 127          * fetches with TXRCV full.
 128          */
 129         struct {
 130                 struct schedule_node *node;
 131                 int coords;
 132         } tex_fifo[8];
 133         int tfreq_count; /**< Number of texture coords outstanding. */
 134         int tfrcv_count; /**< Number of texture results outstanding. */
 135         int tex_fifo_pos;
 136 };
 137
 138 static void
 139 block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
 140 {
 141         add_dep(state->dir, state->tex_fifo[0].node, n);
 142
 143         state->tfreq_count -= state->tex_fifo[0].coords;
 144         state->tfrcv_count--;
 145
 146         memmove(&state->tex_fifo[0],
 147                 &state->tex_fifo[1],
 148                 state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
 149         state->tex_fifo_pos--;
 150 }
 151
 152 /**
 153  * Common code for dependencies that need to be tracked both forward and
 154  * backward.
 155  *
 156  * This is for things like "all VPM reads have to happen in order."
 157  */
 158 static void
 159 calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
 160 {
 161         struct qinst *inst = n->inst;
 162         enum direction dir = state->dir;
 163
 164
 165         /* Add deps for temp registers and varyings accesses.  Note that we
 166          * ignore uniforms accesses, because qir_reorder_uniforms() happens
 167          * after this.
 168          */
 169         for (int i = 0; i < qir_get_nsrc(inst); i++) {
 170                 switch (inst->src[i].file) {
 171                 case QFILE_TEMP:
 172                         add_dep(dir,
 173                                 state->last_temp_write[inst->src[i].index], n);
 174                         break;
 175
 176                 case QFILE_VARY:
 177                         add_write_dep(dir, &state->last_vary_read, n);
 178                         break;
 179
 180                 case QFILE_VPM:
 181                         add_write_dep(dir, &state->last_vpm_read, n);
 182                         break;
 183
 184                 default:
 185                         break;
 186                 }
 187         }
 188
 189         switch (inst->op) {
 190         case QOP_VARY_ADD_C:
 191                 add_dep(dir, state->last_vary_read, n);
 192                 break;
 193
 194         case QOP_TEX_RESULT:
 195                 /* Results have to be fetched in order. */
 196                 add_write_dep(dir, &state->last_tex_result, n);
 197                 break;
 198
 199         case QOP_THRSW:
 200                 /* After a new THRSW, one must collect all texture samples
 201                  * queued since the previous THRSW/program start.  For now, we
 202                  * have one THRSW in between each texture setup and its
 203                  * results collection as our input, and we just make sure that
 204                  * that ordering is maintained.
 205                  */
 206                 add_write_dep(dir, &state->last_tex_coord, n);
 207                 add_write_dep(dir, &state->last_tex_result, n);
 208
 209                 /* accumulators and flags are lost across thread switches. */
 210                 add_write_dep(dir, &state->last_sf, n);
 211
 212                 /* Setup, like the varyings, will need to be drained before we
 213                  * thread switch.
 214                  */
 215                 add_write_dep(dir, &state->last_vary_read, n);
 216
 217                 /* The TLB-locking operations have to stay after the last
 218                  * thread switch.
 219                  */
 220                 add_write_dep(dir, &state->last_tlb, n);
 221                 break;
 222
 223         case QOP_TLB_COLOR_READ:
 224         case QOP_MS_MASK:
 225                 add_write_dep(dir, &state->last_tlb, n);
 226                 break;
 227
 228         default:
 229                 break;
 230         }
 231
 232         switch (inst->dst.file) {
 233         case QFILE_VPM:
 234                 add_write_dep(dir, &state->last_vpm_write, n);
 235                 break;
 236
 237         case QFILE_TEMP:
 238                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
 239                 break;
 240
 241         case QFILE_TLB_COLOR_WRITE:
 242         case QFILE_TLB_COLOR_WRITE_MS:
 243         case QFILE_TLB_Z_WRITE:
 244         case QFILE_TLB_STENCIL_SETUP:
 245                 add_write_dep(dir, &state->last_tlb, n);
 246                 break;
 247
 248         case QFILE_TEX_S_DIRECT:
 249         case QFILE_TEX_S:
 250         case QFILE_TEX_T:
 251         case QFILE_TEX_R:
 252         case QFILE_TEX_B:
 253                 /* Texturing setup gets scheduled in order, because
 254                  * the uniforms referenced by them have to land in a
 255                  * specific order.
 256                  */
 257                 add_write_dep(dir, &state->last_tex_coord, n);
 258                 break;
 259
 260         default:
 261                 break;
 262         }
 263
 264         if (qir_depends_on_flags(inst))
 265                 add_dep(dir, state->last_sf, n);
 266
 267         if (inst->sf)
 268                 add_write_dep(dir, &state->last_sf, n);
 269 }
 270
 271 static void
 272 calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
 273                        struct list_head *schedule_list)
 274 {
 275         struct schedule_setup_state state;
 276
 277         memset(&state, 0, sizeof(state));
 278         state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
 279                                               c->num_temps);
 280         state.dir = F;
 281
 282         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
 283                 struct qinst *inst = n->inst;
 284
 285                 calculate_deps(&state, n);
 286
 287                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
 288                         switch (inst->src[i].file) {
 289                         case QFILE_UNIF:
 290                                 add_dep(state.dir, state.last_uniforms_reset, n);
 291                                 break;
 292                         default:
 293                                 break;
 294                         }
 295                 }
 296
 297                 switch (inst->dst.file) {
 298                 case QFILE_TEX_S_DIRECT:
 299                 case QFILE_TEX_S:
 300                 case QFILE_TEX_T:
 301                 case QFILE_TEX_R:
 302                 case QFILE_TEX_B:
 303                         /* From the VC4 spec:
 304                          *
 305                          *     "The TFREQ input FIFO holds two full lots of s,
 306                          *      t, r, b data, plus associated setup data, per
 307                          *      QPU, that is, there are eight data slots. For
 308                          *      each texture request, slots are only consumed
 309                          *      for the components of s, t, r, and b actually
 310                          *      written. Thus the FIFO can hold four requests
 311                          *      of just (s, t) data, or eight requests of just
 312                          *      s data (for direct addressed data lookups).
 313                          *
 314                          *      Note that there is one FIFO per QPU, and the
 315                          *      FIFO has no concept of threads - that is,
 316                          *      multi-threaded shaders must be careful to use
 317                          *      only 1/2 the FIFO depth before reading
 318                          *      back. Multi-threaded programs must also
 319                          *      therefore always thread switch on texture
 320                          *      fetch as the other thread may have data
 321                          *      waiting in the FIFO."
 322                          *
 323                          * If the texture coordinate fifo is full, block this
 324                          * on the last QOP_TEX_RESULT.
 325                          */
 326                         if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
 327                                 block_until_tex_result(&state, n);
 328                         }
 329
 330                         /* From the VC4 spec:
 331                          *
 332                          *     "Since the maximum number of texture requests
 333                          *      in the input (TFREQ) FIFO is four lots of (s,
 334                          *      t) data, the output (TFRCV) FIFO is sized to
 335                          *      holds four lots of max-size color data per
 336                          *      QPU. For non-float color, reads are packed
 337                          *      RGBA8888 data (one read per pixel). For 16-bit
 338                          *      float color, two reads are necessary per
 339                          *      pixel, with reads packed as RG1616 then
 340                          *      BA1616. So per QPU there are eight color slots
 341                          *      in the TFRCV FIFO."
 342                          *
 343                          * If the texture result fifo is full, block adding
 344                          * any more to it until the last QOP_TEX_RESULT.
 345                          */
 346                         if (inst->dst.file == QFILE_TEX_S ||
 347                             inst->dst.file == QFILE_TEX_S_DIRECT) {
 348                                 if (state.tfrcv_count ==
 349                                     (c->fs_threaded ? 2 : 4))
 350                                         block_until_tex_result(&state, n);
 351                                 state.tfrcv_count++;
 352                         }
 353
 354                         state.tex_fifo[state.tex_fifo_pos].coords++;
 355                         state.tfreq_count++;
 356                         break;
 357
 358                 default:
 359                         break;
 360                 }
 361
 362                 switch (inst->op) {
 363                 case QOP_TEX_RESULT:
 364                         /* Results have to be fetched after the
 365                          * coordinate setup.  Note that we're assuming
 366                          * here that our input shader has the texture
 367                          * coord setup and result fetch in order,
 368                          * which is true initially but not of our
 369                          * instruction stream after this pass.
 370                          */
 371                         add_dep(state.dir, state.last_tex_coord, n);
 372
 373                         state.tex_fifo[state.tex_fifo_pos].node = n;
 374
 375                         state.tex_fifo_pos++;
 376                         memset(&state.tex_fifo[state.tex_fifo_pos], 0,
 377                                sizeof(state.tex_fifo[0]));
 378                         break;
 379
 380                 case QOP_UNIFORMS_RESET:
 381                         add_write_dep(state.dir, &state.last_uniforms_reset, n);
 382                         break;
 383
 384                 default:
 385                         break;
 386                 }
 387         }
 388 }
 389
 390 static void
 391 calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
 392                        struct list_head *schedule_list)
 393 {
 394         struct schedule_setup_state state;
 395
 396         memset(&state, 0, sizeof(state));
 397         state.dir = R;
 398         state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
 399                                               c->num_temps);
 400
 401         list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
 402                 calculate_deps(&state, n);
 403         }
 404 }
 405
 406 static int
 407 get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
 408 {
 409         int cost = 0;
 410
 411         if (inst->dst.file == QFILE_TEMP &&
 412             state->temp_writes[inst->dst.index] == 1)
 413                 cost--;
 414
 415         for (int i = 0; i < qir_get_nsrc(inst); i++) {
 416                 if (inst->src[i].file != QFILE_TEMP ||
 417                     BITSET_TEST(state->temp_live, inst->src[i].index)) {
 418                         continue;
 419                 }
 420
 421                 bool already_counted = false;
 422                 for (int j = 0; j < i; j++) {
 423                         if (inst->src[i].file == inst->src[j].file &&
 424                             inst->src[i].index == inst->src[j].index) {
 425                                 already_counted = true;
 426                         }
 427                 }
 428                 if (!already_counted)
 429                         cost++;
 430         }
 431
 432         return cost;
 433 }
 434
 435 static bool
 436 locks_scoreboard(struct qinst *inst)
 437 {
 438         if (inst->op == QOP_TLB_COLOR_READ)
 439                 return true;
 440
 441         switch (inst->dst.file) {
 442         case QFILE_TLB_Z_WRITE:
 443         case QFILE_TLB_COLOR_WRITE:
 444         case QFILE_TLB_COLOR_WRITE_MS:
 445                 return true;
 446         default:
 447                 return false;
 448         }
 449 }
 450
 451 static struct schedule_node *
 452 choose_instruction(struct schedule_state *state)
 453 {
 454         struct schedule_node *chosen = NULL;
 455
 456         list_for_each_entry(struct schedule_node, n, &state->dag->heads,
 457                             dag.link) {
 458                 /* The branches aren't being tracked as dependencies.  Make
 459                  * sure that they stay scheduled as the last instruction of
 460                  * the block, which is to say the first one we choose to
 461                  * schedule.
 462                  */
 463                 if (n->inst->op == QOP_BRANCH)
 464                         return n;
 465
 466                 if (!chosen) {
 467                         chosen = n;
 468                         continue;
 469                 }
 470
 471                 /* Prefer scheduling things that lock the scoreboard, so that
 472                  * they appear late in the program and we get more parallelism
 473                  * between shaders on multiple QPUs hitting the same fragment.
 474                  */
 475                 if (locks_scoreboard(n->inst) &&
 476                     !locks_scoreboard(chosen->inst)) {
 477                         chosen = n;
 478                         continue;
 479                 } else if (!locks_scoreboard(n->inst) &&
 480                            locks_scoreboard(chosen->inst)) {
 481                         continue;
 482                 }
 483
 484                 /* If we would block on the previously chosen node, but would
 485                  * block less on this one, then prefer it.
 486                  */
 487                 if (chosen->unblocked_time > state->time &&
 488                     n->unblocked_time < chosen->unblocked_time) {
 489                         chosen = n;
 490                         continue;
 491                 } else if (n->unblocked_time > state->time &&
 492                            n->unblocked_time > chosen->unblocked_time) {
 493                         continue;
 494                 }
 495
 496                 /* If we can definitely reduce register pressure, do so
 497                  * immediately.
 498                  */
 499                 int register_pressure_cost =
 500                         get_register_pressure_cost(state, n->inst);
 501                 int chosen_register_pressure_cost =
 502                         get_register_pressure_cost(state, chosen->inst);
 503
 504                 if (register_pressure_cost < chosen_register_pressure_cost) {
 505                         chosen = n;
 506                         continue;
 507                 } else if (register_pressure_cost >
 508                            chosen_register_pressure_cost) {
 509                         continue;
 510                 }
 511
 512                 /* Otherwise, prefer instructions with the deepest chain to
 513                  * the end of the program.  This avoids the problem of
 514                  * "everything generates a temp, nothing finishes freeing one,
 515                  * guess I'll just keep emitting varying mul/adds".
 516                  */
 517                 if (n->delay > chosen->delay) {
 518                         chosen = n;
 519                         continue;
 520                 } else if (n->delay < chosen->delay) {
 521                         continue;
 522                 }
 523         }
 524
 525         return chosen;
 526 }
 527
 528 static void
 529 dump_state(struct vc4_compile *c, struct schedule_state *state)
 530 {
 531         uint32_t i = 0;
 532         list_for_each_entry(struct schedule_node, n, &state->dag->heads,
 533                             dag.link) {
 534                 fprintf(stderr, "%3d: ", i++);
 535                 qir_dump_inst(c, n->inst);
 536                 fprintf(stderr, " (%d cost)\n",
 537                         get_register_pressure_cost(state, n->inst));
 538
 539                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 540                         struct schedule_node *child =
 541                                 (struct schedule_node *)edge->child;
 542                         fprintf(stderr, "   - ");
 543                         qir_dump_inst(c, child->inst);
 544                         fprintf(stderr, " (%d parents)\n",
 545                                 child->dag.parent_count);
 546                 }
 547         }
 548 }
 549
 550 /* Estimate of how many instructions we should schedule between operations.
 551  *
 552  * These aren't in real cycle counts, because we're just estimating cycle
 553  * times anyway.  QIR instructions will get paired up when turned into QPU
 554  * instructions, or extra NOP delays will have to be added due to register
 555  * allocation choices.
 556  */
 557 static uint32_t
 558 latency_between(struct schedule_node *before, struct schedule_node *after)
 559 {
 560         if ((before->inst->dst.file == QFILE_TEX_S ||
 561              before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
 562             after->inst->op == QOP_TEX_RESULT)
 563                 return 100;
 564
 565         switch (before->inst->op) {
 566         case QOP_RCP:
 567         case QOP_RSQ:
 568         case QOP_EXP2:
 569         case QOP_LOG2:
 570                 for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
 571                         if (after->inst->src[i].file ==
 572                             before->inst->dst.file &&
 573                             after->inst->src[i].index ==
 574                             before->inst->dst.index) {
 575                                 /* There are two QPU delay slots before we can
 576                                  * read a math result, which could be up to 4
 577                                  * QIR instructions if they packed well.
 578                                  */
 579                                 return 4;
 580                         }
 581                 }
 582                 break;
 583         default:
 584                 break;
 585         }
 586
 587         return 1;
 588 }
 589
 590 /** Recursive computation of the delay member of a node. */
 591 static void
 592 compute_delay(struct dag_node *node, void *state)
 593 {
 594         struct schedule_node *n = (struct schedule_node *)node;
 595
 596         /* The color read needs to be scheduled late, to avoid locking the
 597          * scoreboard early.  This is our best tool for encouraging that.  The
 598          * other scoreboard locking ops will have this happen by default,
 599          * since they are generally the DAG heads or close to them.
 600          */
 601         if (n->inst->op == QOP_TLB_COLOR_READ)
 602                 n->delay = 1000;
 603         else
 604                 n->delay = 1;
 605
 606         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 607                 struct schedule_node *child =
 608                         (struct schedule_node *)edge->child;
 609
 610                 n->delay = MAX2(n->delay, (child->delay +
 611                                            latency_between(child, n)));
 612         }
 613 }
 614
 615 static void
 616 schedule_instructions(struct vc4_compile *c,
 617                       struct qblock *block, struct schedule_state *state)
 618 {
 619         if (debug) {
 620                 fprintf(stderr, "initial deps:\n");
 621                 dump_state(c, state);
 622         }
 623
 624         state->time = 0;
 625         while (!list_is_empty(&state->dag->heads)) {
 626                 struct schedule_node *chosen = choose_instruction(state);
 627                 struct qinst *inst = chosen->inst;
 628
 629                 if (debug) {
 630                         fprintf(stderr, "current list:\n");
 631                         dump_state(c, state);
 632                         fprintf(stderr, "chose: ");
 633                         qir_dump_inst(c, inst);
 634                         fprintf(stderr, " (%d cost)\n",
 635                                 get_register_pressure_cost(state, inst));
 636                 }
 637
 638                 state->time = MAX2(state->time, chosen->unblocked_time);
 639
 640                 /* Schedule this instruction back onto the QIR list. */
 641                 list_add(&inst->link, &block->instructions);
 642
 643                 /* Now that we've scheduled a new instruction, some of its
 644                  * children can be promoted to the list of instructions ready to
 645                  * be scheduled.  Update the children's unblocked time for this
 646                  * DAG edge as we do so.
 647                  */
 648                 util_dynarray_foreach(&chosen->dag.edges,
 649                                       struct dag_edge, edge) {
 650                         struct schedule_node *child =
 651                                 (struct schedule_node *)edge->child;
 652
 653                         child->unblocked_time = MAX2(child->unblocked_time,
 654                                                      state->time +
 655                                                      latency_between(child,
 656                                                                      chosen));
 657                 }
 658                 dag_prune_head(state->dag, &chosen->dag);
 659
 660                 /* Update our tracking of register pressure. */
 661                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
 662                         if (inst->src[i].file == QFILE_TEMP)
 663                                 BITSET_SET(state->temp_live, inst->src[i].index);
 664                 }
 665                 if (inst->dst.file == QFILE_TEMP) {
 666                         state->temp_writes[inst->dst.index]--;
 667                         if (state->temp_writes[inst->dst.index] == 0)
 668                                 BITSET_CLEAR(state->temp_live, inst->dst.index);
 669                 }
 670
 671                 state->time++;
 672         }
 673 }
 674
 675 static void
 676 qir_schedule_instructions_block(struct vc4_compile *c,
 677                                 struct qblock *block)
 678 {
 679         struct schedule_state *state = rzalloc(NULL, struct schedule_state);
 680
 681         state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
 682         state->temp_live = rzalloc_array(state, BITSET_WORD,
 683                                          BITSET_WORDS(c->num_temps));
 684         state->dag = dag_create(state);
 685
 686         struct list_head setup_list;
 687         list_inithead(&setup_list);
 688
 689         /* Wrap each instruction in a scheduler structure. */
 690         qir_for_each_inst_safe(inst, block) {
 691                 struct schedule_node *n = rzalloc(state, struct schedule_node);
 692
 693                 n->inst = inst;
 694                 list_del(&inst->link);
 695                 list_addtail(&n->link, &setup_list);
 696                 dag_init_node(state->dag, &n->dag);
 697
 698                 if (inst->dst.file == QFILE_TEMP)
 699                         state->temp_writes[inst->dst.index]++;
 700         }
 701
 702         /* Dependencies tracked top-to-bottom. */
 703         calculate_forward_deps(c, state, &setup_list);
 704         /* Dependencies tracked bottom-to-top. */
 705         calculate_reverse_deps(c, state, &setup_list);
 706
 707         dag_traverse_bottom_up(state->dag, compute_delay, NULL);
 708
 709         schedule_instructions(c, block, state);
 710
 711         ralloc_free(state);
 712 }
 713
 714 void
 715 qir_schedule_instructions(struct vc4_compile *c)
 716 {
 717
 718         if (debug) {
 719                 fprintf(stderr, "Pre-schedule instructions\n");
 720                 qir_dump(c);
 721         }
 722
 723         qir_for_each_block(block, c)
 724                 qir_schedule_instructions_block(c, block);
 725
 726         if (debug) {
 727                 fprintf(stderr, "Post-schedule instructions\n");
 728                 qir_dump(c);
 729         }
 730 }