src/gallium/drivers/vc4/vc4_qpu_schedule.c

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  * Copyright © 2014 Broadcom
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * @file vc4_qpu_schedule.c
  27  *
  28  * The basic model of the list scheduler is to take a basic block, compute a
  29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
  30  * pick a DAG head, then put all the children that are now DAG heads into the
  31  * list of things to schedule.
  32  *
  33  * The goal of scheduling here is to pack pairs of operations together in a
  34  * single QPU instruction.
  35  */
  36
  37 #include "vc4_qir.h"
  38 #include "vc4_qpu.h"
  39 #include "util/ralloc.h"
  40 #include "util/dag.h"
  41
  42 static bool debug;
  43
  44 struct schedule_node_child;
  45
/**
 * One node of the scheduling DAG, wrapping a single queued QPU instruction
 * together with the timing estimates the scheduler keeps for it.
 */
struct schedule_node {
        /* DAG bookkeeping.  This must remain the first member: code in this
         * file casts struct dag_node pointers (e.g. edge->child) directly to
         * struct schedule_node pointers.
         */
        struct dag_node dag;
        /* Link in the per-block schedule list that the dependency
         * calculation passes walk.
         */
        struct list_head link;
        /* The instruction this node represents. */
        struct queued_qpu_inst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};
  73
/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 *
 * F is used by the forward dependency pass and R by the reverse pass.
 */
enum direction { F, R };
  78
/**
 * Tracking state for one dependency-calculation pass over a block.
 *
 * Each last_* member remembers the most recent node (in walk order) that
 * touched the corresponding resource, so that the next node touching it can
 * be given a DAG edge against it.
 */
struct schedule_state {
        struct dag *dag;
        /* Indexed by accumulator number (r0..r5): mux values other than A/B
         * and (waddr - QPU_W_ACC0) both index this array.
         */
        struct schedule_node *last_r[6];
        /* Indexed by physical regfile A / B address (0..31). */
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        /* Last node ordered against the condition flags. */
        struct schedule_node *last_sf;
        /* Last VPM read, or VPMVCD_SETUP write on the regfile A side. */
        struct schedule_node *last_vpm_read;
        /* Last TMU register write, TMU result load or thread switch. */
        struct schedule_node *last_tmu_write;
        /* Last node ordered against the TLB. */
        struct schedule_node *last_tlb;
        /* Last VPM write, or VPMVCD_SETUP write on the regfile B side. */
        struct schedule_node *last_vpm;
        /* Last write to QPU_W_UNIFORMS_ADDRESS. */
        struct schedule_node *last_uniforms_reset;
        /* Walk direction of the current pass. */
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
  94
  95 static void
  96 add_dep(struct schedule_state *state,
  97         struct schedule_node *before,
  98         struct schedule_node *after,
  99         bool write)
 100 {
 101         bool write_after_read = !write && state->dir == R;
 102         void *edge_data = (void *)(uintptr_t)write_after_read;
 103
 104         if (!before || !after)
 105                 return;
 106
 107         assert(before != after);
 108
 109         if (state->dir == F)
 110                 dag_add_edge(&before->dag, &after->dag, edge_data);
 111         else
 112                 dag_add_edge(&after->dag, &before->dag, edge_data);
 113 }
 114
 115 static void
 116 add_read_dep(struct schedule_state *state,
 117               struct schedule_node *before,
 118               struct schedule_node *after)
 119 {
 120         add_dep(state, before, after, false);
 121 }
 122
 123 static void
 124 add_write_dep(struct schedule_state *state,
 125               struct schedule_node **before,
 126               struct schedule_node *after)
 127 {
 128         add_dep(state, *before, after, true);
 129         *before = after;
 130 }
 131
 132 static bool
 133 qpu_writes_r4(uint64_t inst)
 134 {
 135         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 136
 137         switch(sig) {
 138         case QPU_SIG_COLOR_LOAD:
 139         case QPU_SIG_LOAD_TMU0:
 140         case QPU_SIG_LOAD_TMU1:
 141         case QPU_SIG_ALPHA_MASK_LOAD:
 142                 return true;
 143         default:
 144                 return false;
 145         }
 146 }
 147
 148 static void
 149 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
 150                    uint32_t raddr, bool is_a)
 151 {
 152         switch (raddr) {
 153         case QPU_R_VARY:
 154                 add_write_dep(state, &state->last_r[5], n);
 155                 break;
 156
 157         case QPU_R_VPM:
 158                 add_write_dep(state, &state->last_vpm_read, n);
 159                 break;
 160
 161         case QPU_R_UNIF:
 162                 add_read_dep(state, state->last_uniforms_reset, n);
 163                 break;
 164
 165         case QPU_R_NOP:
 166         case QPU_R_ELEM_QPU:
 167         case QPU_R_XY_PIXEL_COORD:
 168         case QPU_R_MS_REV_FLAGS:
 169                 break;
 170
 171         default:
 172                 if (raddr < 32) {
 173                         if (is_a)
 174                                 add_read_dep(state, state->last_ra[raddr], n);
 175                         else
 176                                 add_read_dep(state, state->last_rb[raddr], n);
 177                 } else {
 178                         fprintf(stderr, "unknown raddr %d\n", raddr);
 179                         abort();
 180                 }
 181                 break;
 182         }
 183 }
 184
 185 static bool
 186 is_tmu_write(uint32_t waddr)
 187 {
 188         switch (waddr) {
 189         case QPU_W_TMU0_S:
 190         case QPU_W_TMU0_T:
 191         case QPU_W_TMU0_R:
 192         case QPU_W_TMU0_B:
 193         case QPU_W_TMU1_S:
 194         case QPU_W_TMU1_T:
 195         case QPU_W_TMU1_R:
 196         case QPU_W_TMU1_B:
 197                 return true;
 198         default:
 199                 return false;
 200         }
 201 }
 202
 203 static bool
 204 reads_uniform(uint64_t inst)
 205 {
 206         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
 207                 return false;
 208
 209         return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
 210                 (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
 211                  QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
 212                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
 213                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
 214 }
 215
 216 static void
 217 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
 218                  uint32_t mux)
 219 {
 220         if (mux != QPU_MUX_A && mux != QPU_MUX_B)
 221                 add_read_dep(state, state->last_r[mux], n);
 222 }
 223
 224
 225 static void
 226 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
 227                    uint32_t waddr, bool is_add)
 228 {
 229         uint64_t inst = n->inst->inst;
 230         bool is_a = is_add ^ ((inst & QPU_WS) != 0);
 231
 232         if (waddr < 32) {
 233                 if (is_a) {
 234                         add_write_dep(state, &state->last_ra[waddr], n);
 235                 } else {
 236                         add_write_dep(state, &state->last_rb[waddr], n);
 237                 }
 238         } else if (is_tmu_write(waddr)) {
 239                 add_write_dep(state, &state->last_tmu_write, n);
 240                 add_read_dep(state, state->last_uniforms_reset, n);
 241         } else if (qpu_waddr_is_tlb(waddr) ||
 242                    waddr == QPU_W_MS_FLAGS) {
 243                 add_write_dep(state, &state->last_tlb, n);
 244         } else {
 245                 switch (waddr) {
 246                 case QPU_W_ACC0:
 247                 case QPU_W_ACC1:
 248                 case QPU_W_ACC2:
 249                 case QPU_W_ACC3:
 250                 case QPU_W_ACC5:
 251                         add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
 252                                       n);
 253                         break;
 254
 255                 case QPU_W_VPM:
 256                         add_write_dep(state, &state->last_vpm, n);
 257                         break;
 258
 259                 case QPU_W_VPMVCD_SETUP:
 260                         if (is_a)
 261                                 add_write_dep(state, &state->last_vpm_read, n);
 262                         else
 263                                 add_write_dep(state, &state->last_vpm, n);
 264                         break;
 265
 266                 case QPU_W_SFU_RECIP:
 267                 case QPU_W_SFU_RECIPSQRT:
 268                 case QPU_W_SFU_EXP:
 269                 case QPU_W_SFU_LOG:
 270                         add_write_dep(state, &state->last_r[4], n);
 271                         break;
 272
 273                 case QPU_W_TLB_STENCIL_SETUP:
 274                         /* This isn't a TLB operation that does things like
 275                          * implicitly lock the scoreboard, but it does have to
 276                          * appear before TLB_Z, and each of the TLB_STENCILs
 277                          * have to schedule in the same order relative to each
 278                          * other.
 279                          */
 280                         add_write_dep(state, &state->last_tlb, n);
 281                         break;
 282
 283                 case QPU_W_MS_FLAGS:
 284                         add_write_dep(state, &state->last_tlb, n);
 285                         break;
 286
 287                 case QPU_W_UNIFORMS_ADDRESS:
 288                         add_write_dep(state, &state->last_uniforms_reset, n);
 289                         break;
 290
 291                 case QPU_W_NOP:
 292                         break;
 293
 294                 default:
 295                         fprintf(stderr, "Unknown waddr %d\n", waddr);
 296                         abort();
 297                 }
 298         }
 299 }
 300
 301 static void
 302 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
 303                   uint32_t cond)
 304 {
 305         switch (cond) {
 306         case QPU_COND_NEVER:
 307         case QPU_COND_ALWAYS:
 308                 break;
 309         default:
 310                 add_read_dep(state, state->last_sf, n);
 311                 break;
 312         }
 313 }
 314
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 *
 * Decodes the fields of @n's instruction and, for every resource it reads
 * or writes, adds a DAG edge against the node last recorded for that
 * resource in @state, updating @state for writes.  The order of the calls
 * below determines the order in which edges are added and trackers updated.
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Load-immediate instructions get no raddr dependencies at all; for
         * small-immediate and branch instructions only raddr_a is treated
         * as a register read.
         */
        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        /* Input muxes only matter for an ALU that is actually doing
         * something.
         */
        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        /* Write addresses are processed regardless of the ALU opcodes. */
        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                /* Branches are ordered after the last flags node. */
                add_read_dep(state, state->last_sf, n);
                break;

        /* The scheduler does not handle these signals and aborts on them. */
        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        /* The QPU_SF bit is not treated as a flags write on branch
         * instructions.
         */
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}
 413
 414 static void
 415 calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
 416                        struct list_head *schedule_list)
 417 {
 418         struct schedule_state state;
 419
 420         memset(&state, 0, sizeof(state));
 421         state.dag = dag;
 422         state.dir = F;
 423
 424         list_for_each_entry(struct schedule_node, node, schedule_list, link)
 425                 calculate_deps(&state, node);
 426 }
 427
 428 static void
 429 calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
 430                        struct list_head *schedule_list)
 431 {
 432         struct schedule_state state;
 433
 434         memset(&state, 0, sizeof(state));
 435         state.dag = dag;
 436         state.dir = R;
 437
 438         list_for_each_entry_rev(struct schedule_node, node, schedule_list,
 439                                 link) {
 440                 calculate_deps(&state, (struct schedule_node *)node);
 441         }
 442 }
 443
/**
 * State carried between instruction choices, used to reject candidates that
 * would violate the hardware's timing restrictions.
 */
struct choose_scoreboard {
        /* DAG whose heads are the candidates for scheduling. */
        struct dag *dag;
        /* Current instruction tick; compared against the last_*_tick
         * members below.  NOTE(review): advanced outside the code visible
         * here -- confirm where.
         */
        int tick;
        /* Tick of the last chosen instruction that wrote an SFU register. */
        int last_sfu_write_tick;
        /* Tick of the last chosen write to QPU_W_UNIFORMS_ADDRESS. */
        int last_uniforms_reset_tick;
        /* Regfile A / B write addresses of the last chosen instruction. */
        uint32_t last_waddr_a, last_waddr_b;
        /* Set once a TLB instruction has been chosen. */
        bool tlb_locked;
};
 452
 453 static bool
 454 reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
 455 {
 456         uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 457         uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
 458         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 459
 460         /* Full immediate loads don't read any registers. */
 461         if (sig == QPU_SIG_LOAD_IMM)
 462                 return false;
 463
 464         uint32_t src_muxes[] = {
 465                 QPU_GET_FIELD(inst, QPU_ADD_A),
 466                 QPU_GET_FIELD(inst, QPU_ADD_B),
 467                 QPU_GET_FIELD(inst, QPU_MUL_A),
 468                 QPU_GET_FIELD(inst, QPU_MUL_B),
 469         };
 470         for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
 471                 if ((src_muxes[i] == QPU_MUX_A &&
 472                      raddr_a < 32 &&
 473                      scoreboard->last_waddr_a == raddr_a) ||
 474                     (src_muxes[i] == QPU_MUX_B &&
 475                      sig != QPU_SIG_SMALL_IMM &&
 476                      raddr_b < 32 &&
 477                      scoreboard->last_waddr_b == raddr_b)) {
 478                         return true;
 479                 }
 480
 481                 if (src_muxes[i] == QPU_MUX_R4) {
 482                         if (scoreboard->tick -
 483                             scoreboard->last_sfu_write_tick <= 2) {
 484                                 return true;
 485                         }
 486                 }
 487         }
 488
 489         if (sig == QPU_SIG_SMALL_IMM &&
 490             QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
 491                 uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
 492                 uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
 493
 494                 if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
 495                     scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
 496                     scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
 497                     scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
 498                         return true;
 499                 }
 500         }
 501
 502         if (reads_uniform(inst) &&
 503             scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
 504                 return true;
 505         }
 506
 507         return false;
 508 }
 509
 510 static bool
 511 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
 512 {
 513         return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
 514 }
 515
 516 static int
 517 get_instruction_priority(uint64_t inst)
 518 {
 519         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 520         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 521         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 522         uint32_t baseline_score;
 523         uint32_t next_score = 0;
 524
 525         /* Schedule TLB operations as late as possible, to get more
 526          * parallelism between shaders.
 527          */
 528         if (qpu_inst_is_tlb(inst))
 529                 return next_score;
 530         next_score++;
 531
 532         /* Schedule texture read results collection late to hide latency. */
 533         if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
 534                 return next_score;
 535         next_score++;
 536
 537         /* Default score for things that aren't otherwise special. */
 538         baseline_score = next_score;
 539         next_score++;
 540
 541         /* Schedule texture read setup early to hide their latency better. */
 542         if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
 543                 return next_score;
 544         next_score++;
 545
 546         return baseline_score;
 547 }
 548
/**
 * Picks the next DAG head to schedule, or NULL if no head is currently
 * acceptable.
 *
 * When @prev_inst is non-NULL the caller is looking for a second
 * instruction to pair with it: candidates must additionally be mergeable
 * with @prev_inst via qpu_merge_inst(), and the priority is computed on the
 * merged instruction.
 *
 * Among acceptable candidates the highest get_instruction_priority() wins;
 * ties are broken by the larger delay, and remaining ties keep the
 * candidate found first in the heads list.
 *
 * NOTE(review): @schedule_list is not used in this function.
 */
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left.  XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        /* Two instructions that both read a uniform are
                         * never paired.
                         */
                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        /* From here on, inst is the merged instruction; a
                         * zero result means the two could not be merged.
                         */
                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                /* Equal priority: prefer the longer remaining dependency
                 * chain.  (If n was just chosen above, this compares n with
                 * itself and changes nothing.)
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}
 655
 656 static void
 657 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 658                              uint64_t inst)
 659 {
 660         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 661         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 662
 663         if (!(inst & QPU_WS)) {
 664                 scoreboard->last_waddr_a = waddr_add;
 665                 scoreboard->last_waddr_b = waddr_mul;
 666         } else {
 667                 scoreboard->last_waddr_b = waddr_add;
 668                 scoreboard->last_waddr_a = waddr_mul;
 669         }
 670
 671         if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
 672             (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
 673                 scoreboard->last_sfu_write_tick = scoreboard->tick;
 674         }
 675
 676         if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
 677             waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
 678                 scoreboard->last_uniforms_reset_tick = scoreboard->tick;
 679         }
 680
 681         if (qpu_inst_is_tlb(inst))
 682                 scoreboard->tlb_locked = true;
 683 }
 684
 685 static void
 686 dump_state(struct dag *dag)
 687 {
 688         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
 689                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
 690                 vc4_qpu_disasm(&n->inst->inst, 1);
 691                 fprintf(stderr, "\n");
 692
 693                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 694                         struct schedule_node *child =
 695                                 (struct schedule_node *)edge->child;
 696                         if (!child)
 697                                 continue;
 698
 699                         fprintf(stderr, "                 - ");
 700                         vc4_qpu_disasm(&child->inst->inst, 1);
 701                         fprintf(stderr, " (%d parents, %c)\n",
 702                                 child->dag.parent_count,
 703                                 edge->data ? 'w' : 'r');
 704                 }
 705         }
 706 }
 707
 708 static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
 709 {
 710         if (waddr < 32)
 711                 return 2;
 712
 713         /* Apply some huge latency between texture fetch requests and getting
 714          * their results back.
 715          *
 716          * FIXME: This is actually pretty bogus.  If we do:
 717          *
 718          * mov tmu0_s, a
 719          * <a bit of math>
 720          * mov tmu0_s, b
 721          * load_tmu0
 722          * <more math>
 723          * load_tmu0
 724          *
 725          * we count that as worse than
 726          *
 727          * mov tmu0_s, a
 728          * mov tmu0_s, b
 729          * <lots of math>
 730          * load_tmu0
 731          * <more math>
 732          * load_tmu0
 733          *
 734          * because we associate the first load_tmu0 with the *second* tmu0_s.
 735          */
 736         if (waddr == QPU_W_TMU0_S) {
 737                 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
 738                         return 100;
 739         }
 740         if (waddr == QPU_W_TMU1_S) {
 741                 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
 742                         return 100;
 743         }
 744
 745         switch(waddr) {
 746         case QPU_W_SFU_RECIP:
 747         case QPU_W_SFU_RECIPSQRT:
 748         case QPU_W_SFU_EXP:
 749         case QPU_W_SFU_LOG:
 750                 return 3;
 751         default:
 752                 return 1;
 753         }
 754 }
 755
 756 static uint32_t
 757 instruction_latency(struct schedule_node *before, struct schedule_node *after)
 758 {
 759         uint64_t before_inst = before->inst->inst;
 760         uint64_t after_inst = after->inst->inst;
 761
 762         return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
 763                                   after_inst),
 764                     waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
 765                                   after_inst));
 766 }
 767
 768 /** Recursive computation of the delay member of a node. */
 769 static void
 770 compute_delay(struct dag_node *node, void *state)
 771 {
 772         struct schedule_node *n = (struct schedule_node *)node;
 773
 774         n->delay = 1;
 775
 776         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 777                 struct schedule_node *child =
 778                         (struct schedule_node *)edge->child;
 779                 n->delay = MAX2(n->delay, (child->delay +
 780                                            instruction_latency(n, child)));
 781         }
 782 }
 783
 784 /* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
 785  * should be called on it later to finish pruning the other edges).
 786  */
 787 static void
 788 pre_remove_head(struct dag *dag, struct schedule_node *n)
 789 {
 790         list_delinit(&n->dag.link);
 791
 792         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 793                 if (edge->data)
 794                         dag_remove_edge(dag, edge);
 795         }
 796 }
 797
 798 static void
 799 mark_instruction_scheduled(struct dag *dag,
 800                            uint32_t time,
 801                            struct schedule_node *node)
 802 {
 803         if (!node)
 804                 return;
 805
 806         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
 807                 struct schedule_node *child =
 808                         (struct schedule_node *)edge->child;
 809
 810                 if (!child)
 811                         continue;
 812
 813                 uint32_t latency = instruction_latency(node, child);
 814
 815                 child->unblocked_time = MAX2(child->unblocked_time,
 816                                              time + latency);
 817         }
 818         dag_prune_head(dag, &node->dag);
 819 }
 820
/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 *
 * @inst is the thread-switch instruction being scheduled; only its signal
 * field is used when it gets merged into an earlier instruction.  After the
 * signal has been placed, NOPs are appended until the instruction count
 * reaches three past the instruction carrying the signal.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         *
         * Up to the last three emitted instructions are examined, newest
         * first.  Every one with no signal overwrites thrsw_ip, so the
         * earliest such instruction in that window wins.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                /* No candidate found: emit the thrsw as its own instruction. */
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots.
         *
         * NOTE(review): this loop relies on qpu_serialize_one_inst()
         * advancing c->qpu_inst_count -- presumably it does; confirm.
         */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}
 865
 866 static uint32_t
 867 schedule_instructions(struct vc4_compile *c,
 868                       struct choose_scoreboard *scoreboard,
 869                       struct qblock *block,
 870                       struct list_head *schedule_list,
 871                       enum quniform_contents *orig_uniform_contents,
 872                       uint32_t *orig_uniform_data,
 873                       uint32_t *next_uniform)
 874 {
 875         uint32_t time = 0;
 876
 877         while (!list_is_empty(&scoreboard->dag->heads)) {
 878                 struct schedule_node *chosen =
 879                         choose_instruction_to_schedule(scoreboard,
 880                                                        schedule_list,
 881                                                        NULL);
 882                 struct schedule_node *merge = NULL;
 883
 884                 /* If there are no valid instructions to schedule, drop a NOP
 885                  * in.
 886                  */
 887                 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();
 888
 889                 if (debug) {
 890                         fprintf(stderr, "t=%4d: current list:\n",
 891                                 time);
 892                         dump_state(scoreboard->dag);
 893                         fprintf(stderr, "t=%4d: chose: ", time);
 894                         vc4_qpu_disasm(&inst, 1);
 895                         fprintf(stderr, "\n");
 896                 }
 897
 898                 /* Schedule this instruction onto the QPU list. Also try to
 899                  * find an instruction to pair with it.
 900                  */
 901                 if (chosen) {
 902                         time = MAX2(chosen->unblocked_time, time);
 903                         pre_remove_head(scoreboard->dag, chosen);
 904                         if (chosen->uniform != -1) {
 905                                 c->uniform_data[*next_uniform] =
 906                                         orig_uniform_data[chosen->uniform];
 907                                 c->uniform_contents[*next_uniform] =
 908                                         orig_uniform_contents[chosen->uniform];
 909                                 (*next_uniform)++;
 910                         }
 911
 912                         merge = choose_instruction_to_schedule(scoreboard,
 913                                                                schedule_list,
 914                                                                chosen);
 915                         if (merge) {
 916                                 time = MAX2(merge->unblocked_time, time);
 917                                 inst = qpu_merge_inst(inst, merge->inst->inst);
 918                                 assert(inst != 0);
 919                                 if (merge->uniform != -1) {
 920                                         c->uniform_data[*next_uniform] =
 921                                                 orig_uniform_data[merge->uniform];
 922                                         c->uniform_contents[*next_uniform] =
 923                                                 orig_uniform_contents[merge->uniform];
 924                                         (*next_uniform)++;
 925                                 }
 926
 927                                 if (debug) {
 928                                         fprintf(stderr, "t=%4d: merging: ",
 929                                                 time);
 930                                         vc4_qpu_disasm(&merge->inst->inst, 1);
 931                                         fprintf(stderr, "\n");
 932                                         fprintf(stderr, "            resulting in: ");
 933                                         vc4_qpu_disasm(&inst, 1);
 934                                         fprintf(stderr, "\n");
 935                                 }
 936                         }
 937                 }
 938
 939                 if (debug) {
 940                         fprintf(stderr, "\n");
 941                 }
 942
 943                 /* Now that we've scheduled a new instruction, some of its
 944                  * children can be promoted to the list of instructions ready to
 945                  * be scheduled.  Update the children's unblocked time for this
 946                  * DAG edge as we do so.
 947                  */
 948                 mark_instruction_scheduled(scoreboard->dag, time, chosen);
 949                 mark_instruction_scheduled(scoreboard->dag, time, merge);
 950
 951                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
 952                     QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
 953                         emit_thrsw(c, scoreboard, inst);
 954                 } else {
 955                         qpu_serialize_one_inst(c, inst);
 956                         update_scoreboard_for_chosen(scoreboard, inst);
 957                 }
 958
 959                 scoreboard->tick++;
 960                 time++;
 961
 962                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
 963                         block->branch_qpu_ip = c->qpu_inst_count - 1;
 964                         /* Fill the delay slots.
 965                          *
 966                          * We should fill these with actual instructions,
 967                          * instead, but that will probably need to be done
 968                          * after this, once we know what the leading
 969                          * instructions of the successors are (so we can
 970                          * handle A/B register file write latency)
 971                         */
 972                         inst = qpu_NOP();
 973                         update_scoreboard_for_chosen(scoreboard, inst);
 974                         qpu_serialize_one_inst(c, inst);
 975                         qpu_serialize_one_inst(c, inst);
 976                         qpu_serialize_one_inst(c, inst);
 977                 }
 978         }
 979
 980         return time;
 981 }
 982
 983 static uint32_t
 984 qpu_schedule_instructions_block(struct vc4_compile *c,
 985                                 struct choose_scoreboard *scoreboard,
 986                                 struct qblock *block,
 987                                 enum quniform_contents *orig_uniform_contents,
 988                                 uint32_t *orig_uniform_data,
 989                                 uint32_t *next_uniform)
 990 {
 991         scoreboard->dag = dag_create(NULL);
 992         struct list_head setup_list;
 993
 994         list_inithead(&setup_list);
 995
 996         /* Wrap each instruction in a scheduler structure. */
 997         uint32_t next_sched_uniform = *next_uniform;
 998         while (!list_is_empty(&block->qpu_inst_list)) {
 999                 struct queued_qpu_inst *inst =
1000                         (struct queued_qpu_inst *)block->qpu_inst_list.next;
1001                 struct schedule_node *n = rzalloc(scoreboard->dag,
1002                                                   struct schedule_node);
1003
1004                 dag_init_node(scoreboard->dag, &n->dag);
1005                 n->inst = inst;
1006
1007                 if (reads_uniform(inst->inst)) {
1008                         n->uniform = next_sched_uniform++;
1009                 } else {
1010                         n->uniform = -1;
1011                 }
1012                 list_del(&inst->link);
1013                 list_addtail(&n->link, &setup_list);
1014         }
1015
1016         calculate_forward_deps(c, scoreboard->dag, &setup_list);
1017         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
1018
1019         dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
1020
1021         uint32_t cycles = schedule_instructions(c, scoreboard, block,
1022                                                 &setup_list,
1023                                                 orig_uniform_contents,
1024                                                 orig_uniform_data,
1025                                                 next_uniform);
1026
1027         ralloc_free(scoreboard->dag);
1028         scoreboard->dag = NULL;
1029
1030         return cycles;
1031 }
1032
1033 static void
1034 qpu_set_branch_targets(struct vc4_compile *c)
1035 {
1036         qir_for_each_block(block, c) {
1037                 /* The end block of the program has no branch. */
1038                 if (!block->successors[0])
1039                         continue;
1040
1041                 /* If there was no branch instruction, then the successor
1042                  * block must follow immediately after this one.
1043                  */
1044                 if (block->branch_qpu_ip == ~0) {
1045                         assert(block->end_qpu_ip + 1 ==
1046                                block->successors[0]->start_qpu_ip);
1047                         continue;
1048                 }
1049
1050                 /* Set the branch target for the block that doesn't follow
1051                  * immediately after ours.
1052                  */
1053                 uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
1054                 assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
1055                 assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
1056
1057                 uint32_t branch_target =
1058                         (block->successors[0]->start_qpu_ip -
1059                          (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
1060                 *branch_inst = (*branch_inst |
1061                                 QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
1062
1063                 /* Make sure that the if-we-don't-jump successor was scheduled
1064                  * just after the delay slots.
1065                  */
1066                 if (block->successors[1]) {
1067                         assert(block->successors[1]->start_qpu_ip ==
1068                                block->branch_qpu_ip + 4);
1069                 }
1070         }
1071 }
1072
1073 uint32_t
1074 qpu_schedule_instructions(struct vc4_compile *c)
1075 {
1076         /* We reorder the uniforms as we schedule instructions, so save the
1077          * old data off and replace it.
1078          */
1079         uint32_t *uniform_data = c->uniform_data;
1080         enum quniform_contents *uniform_contents = c->uniform_contents;
1081         c->uniform_contents = ralloc_array(c, enum quniform_contents,
1082                                            c->num_uniforms);
1083         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1084         c->uniform_array_size = c->num_uniforms;
1085         uint32_t next_uniform = 0;
1086
1087         struct choose_scoreboard scoreboard;
1088         memset(&scoreboard, 0, sizeof(scoreboard));
1089         scoreboard.last_waddr_a = ~0;
1090         scoreboard.last_waddr_b = ~0;
1091         scoreboard.last_sfu_write_tick = -10;
1092         scoreboard.last_uniforms_reset_tick = -10;
1093
1094         if (debug) {
1095                 fprintf(stderr, "Pre-schedule instructions\n");
1096                 qir_for_each_block(block, c) {
1097                         fprintf(stderr, "BLOCK %d\n", block->index);
1098                         list_for_each_entry(struct queued_qpu_inst, q,
1099                                             &block->qpu_inst_list, link) {
1100                                 vc4_qpu_disasm(&q->inst, 1);
1101                                 fprintf(stderr, "\n");
1102                         }
1103                 }
1104                 fprintf(stderr, "\n");
1105         }
1106
1107         uint32_t cycles = 0;
1108         qir_for_each_block(block, c) {
1109                 block->start_qpu_ip = c->qpu_inst_count;
1110                 block->branch_qpu_ip = ~0;
1111
1112                 cycles += qpu_schedule_instructions_block(c,
1113                                                           &scoreboard,
1114                                                           block,
1115                                                           uniform_contents,
1116                                                           uniform_data,
1117                                                           &next_uniform);
1118
1119                 block->end_qpu_ip = c->qpu_inst_count - 1;
1120         }
1121
1122         qpu_set_branch_targets(c);
1123
1124         assert(next_uniform == c->num_uniforms);
1125
1126         if (debug) {
1127                 fprintf(stderr, "Post-schedule instructions\n");
1128                 vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
1129                 fprintf(stderr, "\n");
1130         }
1131
1132         return cycles;
1133 }