src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include "brw_fs.h"
  29 #include "glsl/glsl_types.h"
  30 #include "glsl/ir_optimization.h"
  31 #include "glsl/ir_print_visitor.h"
  32
  33 /** @file brw_fs_schedule_instructions.cpp
  34  *
  35  * List scheduling of FS instructions.
  36  *
  37  * The basic model of the list scheduler is to take a basic block,
  38  * compute a DAG of the dependencies (RAW ordering with latency, WAW
  39  * ordering, WAR ordering), and make a list of the DAG heads.
  40  * Heuristically pick a DAG head, then put all the children that are
  41  * now DAG heads into the list of things to schedule.
  42  *
  43  * The heuristic is the important part.  We're trying to be cheap,
  44  * since actually computing the optimal scheduling is NP complete.
  45  * What we do is track a "current clock".  When we schedule a node, we
  46  * update the earliest-unblocked clock time of its children, and
  47  * increment the clock.  Then, when trying to schedule, we just pick
  48  * the earliest-unblocked instruction to schedule.
  49  *
  50  * Note that often there will be many things which could execute
  51  * immediately, and there are a range of heuristic options to choose
  52  * from in picking among those.
  53  */
  54
  55 class schedule_node : public exec_node
  56 {
  57 public:
  58    schedule_node(fs_inst *inst)
  59    {
  60       this->inst = inst;
  61       this->child_array_size = 0;
  62       this->children = NULL;
  63       this->child_latency = NULL;
  64       this->child_count = 0;
  65       this->parent_count = 0;
  66       this->unblocked_time = 0;
  67
  68       int chans = 8;
  69       int math_latency = 22;
  70
  71       switch (inst->opcode) {
  72       case SHADER_OPCODE_RCP:
  73          this->latency = 1 * chans * math_latency;
  74          break;
  75       case SHADER_OPCODE_RSQ:
  76          this->latency = 2 * chans * math_latency;
  77          break;
  78       case SHADER_OPCODE_INT_QUOTIENT:
  79       case SHADER_OPCODE_SQRT:
  80       case SHADER_OPCODE_LOG2:
  81          /* full precision log.  partial is 2. */
  82          this->latency = 3 * chans * math_latency;
  83          break;
  84       case SHADER_OPCODE_INT_REMAINDER:
  85       case SHADER_OPCODE_EXP2:
  86          /* full precision.  partial is 3, same throughput. */
  87          this->latency = 4 * chans * math_latency;
  88          break;
  89       case SHADER_OPCODE_POW:
  90          this->latency = 8 * chans * math_latency;
  91          break;
  92       case SHADER_OPCODE_SIN:
  93       case SHADER_OPCODE_COS:
  94          /* minimum latency, max is 12 rounds. */
  95          this->latency = 5 * chans * math_latency;
  96          break;
  97       default:
  98          this->latency = 2;
  99          break;
 100       }
 101    }
 102
 103    fs_inst *inst;
 104    schedule_node **children;
 105    int *child_latency;
 106    int child_count;
 107    int parent_count;
 108    int child_array_size;
 109    int unblocked_time;
 110    int latency;
 111 };
 112
 113 class instruction_scheduler {
 114 public:
 115    instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
 116                          bool post_reg_alloc)
 117    {
 118       this->v = v;
 119       this->mem_ctx = ralloc_context(mem_ctx);
 120       this->grf_count = grf_count;
 121       this->instructions.make_empty();
 122       this->instructions_to_schedule = 0;
 123       this->post_reg_alloc = post_reg_alloc;
 124    }
 125
 126    ~instruction_scheduler()
 127    {
 128       ralloc_free(this->mem_ctx);
 129    }
 130    void add_barrier_deps(schedule_node *n);
 131    void add_dep(schedule_node *before, schedule_node *after, int latency);
 132    void add_dep(schedule_node *before, schedule_node *after);
 133
 134    void add_inst(fs_inst *inst);
 135    void calculate_deps();
 136    void schedule_instructions(fs_inst *next_block_header);
 137
 138    bool is_compressed(fs_inst *inst);
 139
 140    void *mem_ctx;
 141
 142    bool post_reg_alloc;
 143    int instructions_to_schedule;
 144    int grf_count;
 145    exec_list instructions;
 146    fs_visitor *v;
 147 };
 148
 149 void
 150 instruction_scheduler::add_inst(fs_inst *inst)
 151 {
 152    schedule_node *n = new(mem_ctx) schedule_node(inst);
 153
 154    assert(!inst->is_head_sentinel());
 155    assert(!inst->is_tail_sentinel());
 156
 157    this->instructions_to_schedule++;
 158
 159    inst->remove();
 160    instructions.push_tail(n);
 161 }
 162
 163 /**
 164  * Add a dependency between two instruction nodes.
 165  *
 166  * The @after node will be scheduled after @before.  We will try to
 167  * schedule it @latency cycles after @before, but no guarantees there.
 168  */
 169 void
 170 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 171                                int latency)
 172 {
 173    if (!before || !after)
 174       return;
 175
 176    assert(before != after);
 177
 178    for (int i = 0; i < before->child_count; i++) {
 179       if (before->children[i] == after) {
 180          before->child_latency[i] = MAX2(before->child_latency[i], latency);
 181          return;
 182       }
 183    }
 184
 185    if (before->child_array_size <= before->child_count) {
 186       if (before->child_array_size < 16)
 187          before->child_array_size = 16;
 188       else
 189          before->child_array_size *= 2;
 190
 191       before->children = reralloc(mem_ctx, before->children,
 192                                   schedule_node *,
 193                                   before->child_array_size);
 194       before->child_latency = reralloc(mem_ctx, before->child_latency,
 195                                        int, before->child_array_size);
 196    }
 197
 198    before->children[before->child_count] = after;
 199    before->child_latency[before->child_count] = latency;
 200    before->child_count++;
 201    after->parent_count++;
 202 }
 203
 204 void
 205 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
 206 {
 207    if (!before)
 208       return;
 209
 210    add_dep(before, after, before->latency);
 211 }
 212
 213 /**
 214  * Sometimes we really want this node to execute after everything that
 215  * was before it and before everything that followed it.  This adds
 216  * the deps to do so.
 217  */
 218 void
 219 instruction_scheduler::add_barrier_deps(schedule_node *n)
 220 {
 221    schedule_node *prev = (schedule_node *)n->prev;
 222    schedule_node *next = (schedule_node *)n->next;
 223
 224    if (prev) {
 225       while (!prev->is_head_sentinel()) {
 226          add_dep(prev, n, 0);
 227          prev = (schedule_node *)prev->prev;
 228       }
 229    }
 230
 231    if (next) {
 232       while (!next->is_tail_sentinel()) {
 233          add_dep(n, next, 0);
 234          next = (schedule_node *)next->next;
 235       }
 236    }
 237 }
 238
 239 /* instruction scheduling needs to be aware of when an MRF write
 240  * actually writes 2 MRFs.
 241  */
 242 bool
 243 instruction_scheduler::is_compressed(fs_inst *inst)
 244 {
 245    return (v->dispatch_width == 16 &&
 246            !inst->force_uncompressed &&
 247            !inst->force_sechalf);
 248 }
 249
 250 void
 251 instruction_scheduler::calculate_deps()
 252 {
 253    /* Pre-register-allocation, this tracks the last write per VGRF (so
 254     * different reg_offsets within it can interfere when they shouldn't).
 255     * After register allocation, reg_offsets are gone and we track individual
 256     * GRF registers.
 257     */
 258    schedule_node *last_grf_write[grf_count];
 259    schedule_node *last_mrf_write[BRW_MAX_MRF];
 260    schedule_node *last_conditional_mod[2] = { NULL, NULL };
 261    /* Fixed HW registers are assumed to be separate from the virtual
 262     * GRFs, so they can be tracked separately.  We don't really write
 263     * to fixed GRFs much, so don't bother tracking them on a more
 264     * granular level.
 265     */
 266    schedule_node *last_fixed_grf_write = NULL;
 267    int reg_width = v->dispatch_width / 8;
 268
 269    /* The last instruction always needs to still be the last
 270     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
 271     * WHILE) and scheduling other things after it would disturb the
 272     * basic block, or it's FB_WRITE and we should do a better job at
 273     * dead code elimination anyway.
 274     */
 275    schedule_node *last = (schedule_node *)instructions.get_tail();
 276    add_barrier_deps(last);
 277
 278    memset(last_grf_write, 0, sizeof(last_grf_write));
 279    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 280
 281    /* top-to-bottom dependencies: RAW and WAW. */
 282    foreach_list(node, &instructions) {
 283       schedule_node *n = (schedule_node *)node;
 284       fs_inst *inst = n->inst;
 285
 286       /* read-after-write deps. */
 287       for (int i = 0; i < 3; i++) {
 288          if (inst->src[i].file == GRF) {
 289             if (post_reg_alloc) {
 290                for (int r = 0; r < reg_width; r++)
 291                   add_dep(last_grf_write[inst->src[i].reg + r], n);
 292             } else {
 293                add_dep(last_grf_write[inst->src[i].reg], n);
 294             }
 295          } else if (inst->src[i].file == FIXED_HW_REG &&
 296                     (inst->src[i].fixed_hw_reg.file ==
 297                      BRW_GENERAL_REGISTER_FILE)) {
 298             if (post_reg_alloc) {
 299                for (int r = 0; r < reg_width; r++)
 300                   add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
 301             } else {
 302                add_dep(last_fixed_grf_write, n);
 303             }
 304          } else if (inst->src[i].file != BAD_FILE &&
 305                     inst->src[i].file != IMM &&
 306                     inst->src[i].file != UNIFORM) {
 307             assert(inst->src[i].file != MRF);
 308             add_barrier_deps(n);
 309          }
 310       }
 311
 312       for (int i = 0; i < inst->mlen; i++) {
 313          /* It looks like the MRF regs are released in the send
 314           * instruction once it's sent, not when the result comes
 315           * back.
 316           */
 317          add_dep(last_mrf_write[inst->base_mrf + i], n);
 318       }
 319
 320       if (inst->predicate) {
 321          add_dep(last_conditional_mod[inst->flag_subreg], n);
 322       }
 323
 324       /* write-after-write deps. */
 325       if (inst->dst.file == GRF) {
 326          if (post_reg_alloc) {
 327             for (int r = 0; r < inst->regs_written() * reg_width; r++) {
 328                add_dep(last_grf_write[inst->dst.reg + r], n);
 329                last_grf_write[inst->dst.reg + r] = n;
 330             }
 331          } else {
 332             add_dep(last_grf_write[inst->dst.reg], n);
 333             last_grf_write[inst->dst.reg] = n;
 334          }
 335       } else if (inst->dst.file == MRF) {
 336          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 337
 338          add_dep(last_mrf_write[reg], n);
 339          last_mrf_write[reg] = n;
 340          if (is_compressed(inst)) {
 341             if (inst->dst.reg & BRW_MRF_COMPR4)
 342                reg += 4;
 343             else
 344                reg++;
 345             add_dep(last_mrf_write[reg], n);
 346             last_mrf_write[reg] = n;
 347          }
 348       } else if (inst->dst.file == FIXED_HW_REG &&
 349                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 350          if (post_reg_alloc) {
 351             for (int r = 0; r < reg_width; r++)
 352                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
 353          } else {
 354             last_fixed_grf_write = n;
 355          }
 356       } else if (inst->dst.file != BAD_FILE) {
 357          add_barrier_deps(n);
 358       }
 359
 360       if (inst->mlen > 0) {
 361          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 362             add_dep(last_mrf_write[inst->base_mrf + i], n);
 363             last_mrf_write[inst->base_mrf + i] = n;
 364          }
 365       }
 366
 367       /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
 368        * conditional_mod, because it sets the flag register.
 369        */
 370       if (inst->conditional_mod ||
 371           inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
 372          add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
 373          last_conditional_mod[inst->flag_subreg] = n;
 374       }
 375    }
 376
 377    /* bottom-to-top dependencies: WAR */
 378    memset(last_grf_write, 0, sizeof(last_grf_write));
 379    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 380    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
 381    last_fixed_grf_write = NULL;
 382
 383    exec_node *node;
 384    exec_node *prev;
 385    for (node = instructions.get_tail(), prev = node->prev;
 386         !node->is_head_sentinel();
 387         node = prev, prev = node->prev) {
 388       schedule_node *n = (schedule_node *)node;
 389       fs_inst *inst = n->inst;
 390
 391       /* write-after-read deps. */
 392       for (int i = 0; i < 3; i++) {
 393          if (inst->src[i].file == GRF) {
 394             if (post_reg_alloc) {
 395                for (int r = 0; r < reg_width; r++)
 396                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
 397             } else {
 398                add_dep(n, last_grf_write[inst->src[i].reg]);
 399             }
 400          } else if (inst->src[i].file == FIXED_HW_REG &&
 401                     (inst->src[i].fixed_hw_reg.file ==
 402                      BRW_GENERAL_REGISTER_FILE)) {
 403             if (post_reg_alloc) {
 404                for (int r = 0; r < reg_width; r++)
 405                   add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
 406             } else {
 407                add_dep(n, last_fixed_grf_write);
 408             }
 409          } else if (inst->src[i].file != BAD_FILE &&
 410                     inst->src[i].file != IMM &&
 411                     inst->src[i].file != UNIFORM) {
 412             assert(inst->src[i].file != MRF);
 413             add_barrier_deps(n);
 414          }
 415       }
 416
 417       for (int i = 0; i < inst->mlen; i++) {
 418          /* It looks like the MRF regs are released in the send
 419           * instruction once it's sent, not when the result comes
 420           * back.
 421           */
 422          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
 423       }
 424
 425       if (inst->predicate) {
 426          add_dep(n, last_conditional_mod[inst->flag_subreg]);
 427       }
 428
 429       /* Update the things this instruction wrote, so earlier reads
 430        * can mark this as WAR dependency.
 431        */
 432       if (inst->dst.file == GRF) {
 433          if (post_reg_alloc) {
 434             for (int r = 0; r < inst->regs_written() * reg_width; r++)
 435                last_grf_write[inst->dst.reg + r] = n;
 436          } else {
 437             last_grf_write[inst->dst.reg] = n;
 438          }
 439       } else if (inst->dst.file == MRF) {
 440          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 441
 442          last_mrf_write[reg] = n;
 443
 444          if (is_compressed(inst)) {
 445             if (inst->dst.reg & BRW_MRF_COMPR4)
 446                reg += 4;
 447             else
 448                reg++;
 449
 450             last_mrf_write[reg] = n;
 451          }
 452       } else if (inst->dst.file == FIXED_HW_REG &&
 453                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 454          if (post_reg_alloc) {
 455             for (int r = 0; r < reg_width; r++)
 456                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
 457          } else {
 458             last_fixed_grf_write = n;
 459          }
 460       } else if (inst->dst.file != BAD_FILE) {
 461          add_barrier_deps(n);
 462       }
 463
 464       if (inst->mlen > 0) {
 465          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 466             last_mrf_write[inst->base_mrf + i] = n;
 467          }
 468       }
 469
 470       /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
 471        * conditional_mod, because it sets the flag register.
 472        */
 473       if (inst->conditional_mod ||
 474           inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
 475          last_conditional_mod[inst->flag_subreg] = n;
 476       }
 477    }
 478 }
 479
 480 void
 481 instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
 482 {
 483    int time = 0;
 484
 485    /* Remove non-DAG heads from the list. */
 486    foreach_list_safe(node, &instructions) {
 487       schedule_node *n = (schedule_node *)node;
 488       if (n->parent_count != 0)
 489          n->remove();
 490    }
 491
 492    while (!instructions.is_empty()) {
 493       schedule_node *chosen = NULL;
 494       int chosen_time = 0;
 495
 496       foreach_list(node, &instructions) {
 497          schedule_node *n = (schedule_node *)node;
 498
 499          if (!chosen || n->unblocked_time < chosen_time) {
 500             chosen = n;
 501             chosen_time = n->unblocked_time;
 502          }
 503       }
 504
 505       /* Schedule this instruction. */
 506       assert(chosen);
 507       chosen->remove();
 508       next_block_header->insert_before(chosen->inst);
 509       instructions_to_schedule--;
 510
 511       /* Bump the clock.  If we expected a delay for scheduling, then
 512        * bump the clock to reflect that.
 513        */
 514       time = MAX2(time + 1, chosen_time);
 515
 516       /* Now that we've scheduled a new instruction, some of its
 517        * children can be promoted to the list of instructions ready to
 518        * be scheduled.  Update the children's unblocked time for this
 519        * DAG edge as we do so.
 520        */
 521       for (int i = 0; i < chosen->child_count; i++) {
 522          schedule_node *child = chosen->children[i];
 523
 524          child->unblocked_time = MAX2(child->unblocked_time,
 525                                       time + chosen->child_latency[i]);
 526
 527          child->parent_count--;
 528          if (child->parent_count == 0) {
 529             instructions.push_tail(child);
 530          }
 531       }
 532
 533       /* Shared resource: the mathbox.  There's one per EU (on later
 534        * generations, it's even more limited pre-gen6), so if we send
 535        * something off to it then the next math isn't going to make
 536        * progress until the first is done.
 537        */
 538       if (chosen->inst->is_math()) {
 539          foreach_list(node, &instructions) {
 540             schedule_node *n = (schedule_node *)node;
 541
 542             if (n->inst->is_math())
 543                n->unblocked_time = MAX2(n->unblocked_time,
 544                                         time + chosen->latency);
 545          }
 546       }
 547    }
 548
 549    assert(instructions_to_schedule == 0);
 550 }
 551
 552 void
 553 fs_visitor::schedule_instructions(bool post_reg_alloc)
 554 {
 555    fs_inst *next_block_header = (fs_inst *)instructions.head;
 556
 557    int grf_count;
 558    if (post_reg_alloc)
 559       grf_count = grf_used;
 560    else
 561       grf_count = virtual_grf_count;
 562
 563    instruction_scheduler sched(this, mem_ctx, grf_count, post_reg_alloc);
 564
 565    while (!next_block_header->is_tail_sentinel()) {
 566       /* Add things to be scheduled until we get to a new BB. */
 567       while (!next_block_header->is_tail_sentinel()) {
 568          fs_inst *inst = next_block_header;
 569          next_block_header = (fs_inst *)next_block_header->next;
 570
 571          sched.add_inst(inst);
 572          if (inst->opcode == BRW_OPCODE_IF ||
 573              inst->opcode == BRW_OPCODE_ELSE ||
 574              inst->opcode == BRW_OPCODE_ENDIF ||
 575              inst->opcode == BRW_OPCODE_DO ||
 576              inst->opcode == BRW_OPCODE_WHILE ||
 577              inst->opcode == BRW_OPCODE_BREAK ||
 578              inst->opcode == BRW_OPCODE_CONTINUE) {
 579             break;
 580          }
 581       }
 582       sched.calculate_deps();
 583       sched.schedule_instructions(next_block_header);
 584    }
 585
 586    this->live_intervals_valid = false;
 587 }