/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 */
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */
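
/* A worked example of the heuristic (hypothetical timings): suppose the
 * DAG heads are a MUL with latency 2 feeding an ADD, plus an independent
 * MOV.  Scheduling the MUL advances the clock to 1 and raises the ADD's
 * unblocked_time to 1 + 2 = 3, so the MOV (unblocked_time 0) is picked
 * next, hiding a cycle of the MUL's latency.
 */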

class schedule_node : public exec_node
{
public:
   schedule_node(fs_inst *inst)
   {
      this->inst = inst;
      this->child_array_size = 0;
      this->children = NULL;
      this->child_latency = NULL;
      this->child_count = 0;
      this->parent_count = 0;
      this->unblocked_time = 0;

      int chans = 8;
      int math_latency = 22;

      switch (inst->opcode) {
      case SHADER_OPCODE_RCP:
         this->latency = 1 * chans * math_latency;
         break;
      case SHADER_OPCODE_RSQ:
         this->latency = 2 * chans * math_latency;
         break;
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_LOG2:
         /* full precision log.  partial is 2. */
         this->latency = 3 * chans * math_latency;
         break;
      case SHADER_OPCODE_EXP2:
         /* full precision.  partial is 3, same throughput. */
         this->latency = 4 * chans * math_latency;
         break;
      case SHADER_OPCODE_POW:
         this->latency = 8 * chans * math_latency;
         break;
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* minimum latency, max is 12 rounds. */
         this->latency = 5 * chans * math_latency;
         break;
      default:
         this->latency = 2;
         break;
      }
   }
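
   /* Scheduling state.  children[] and child_latency[] are parallel
    * arrays of outgoing DAG edges: child i becomes unblocked
    * child_latency[i] cycles after this node issues.
    */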
   fs_inst *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;
};

class instruction_scheduler {
public:
   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
   {
      this->v = v;
      this->mem_ctx = ralloc_context(mem_ctx);
      this->virtual_grf_count = virtual_grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
   }
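
   /* Every per-pass allocation (the schedule_nodes and their child
    * arrays) is made on this local ralloc context, so tearing the
    * scheduler down is a single ralloc_free().
    */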
   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }

   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void add_inst(fs_inst *inst);
   void calculate_deps();
   void schedule_instructions(fs_inst *next_block_header);

   bool is_compressed(fs_inst *inst);

   void *mem_ctx;

   int instructions_to_schedule;
   int virtual_grf_count;
   exec_list instructions;
   fs_visitor *v;
};

void
instruction_scheduler::add_inst(fs_inst *inst)
{
   schedule_node *n = new(mem_ctx) schedule_node(inst);

   assert(!inst->is_head_sentinel());
   assert(!inst->is_tail_sentinel());

   this->instructions_to_schedule++;
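
   /* Unlink the instruction from the visitor's list; it will be
    * re-inserted at its scheduled position by schedule_instructions().
    */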
   inst->remove();
   instructions.push_tail(n);
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }
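
   /* Grow the child arrays geometrically (start at 16, then double) so
    * that repeated add_dep() calls stay amortized O(1) per edge.
    */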
   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
instruction_scheduler::is_compressed(fs_inst *inst)
{
   return (v->c->dispatch_width == 16 &&
           !inst->force_uncompressed &&
           !inst->force_sechalf);
}
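
/* A compressed (SIMD16) MRF write touches a pair of registers: reg and
 * reg + 4 under BRW_MRF_COMPR4 addressing, or reg and reg + 1 otherwise.
 * calculate_deps() below tracks both halves of the pair.
 */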

void
instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[virtual_grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to still be the last
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
    * WHILE) and scheduling other things after it would disturb the
    * basic block, or it's FB_WRITE and we should do a better job at
    * dead code elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_list(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(last_grf_write[inst->src[i].reg], n);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(last_mrf_write[inst->base_mrf + i], n);
      }

      if (inst->predicated) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }
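
      /* (The flag register isn't tracked the way GRFs and MRFs are;
       * a predicated instruction just depends on whichever instruction
       * last wrote the flag via conditional_mod.)
       */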
      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         add_dep(last_grf_write[inst->dst.reg], n);
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->conditional_mod) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(n, last_grf_write[inst->src[i].reg]);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
      }

      if (inst->predicated) {
         add_dep(n, last_conditional_mod);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == GRF) {
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->conditional_mod)
         last_conditional_mod = n;
   }
}

void
instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
{
   int time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_list_safe(node, &instructions) {
      schedule_node *n = (schedule_node *)node;

      if (n->parent_count != 0)
         n->remove();
   }

   while (!instructions.is_empty()) {
      schedule_node *chosen = NULL;
      int chosen_time = 0;

      foreach_list(node, &instructions) {
         schedule_node *n = (schedule_node *)node;

         if (!chosen || n->unblocked_time < chosen_time) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }

      /* Schedule this instruction. */
      chosen->remove();
      next_block_header->insert_before(chosen->inst);
      instructions_to_schedule--;

      /* Bump the clock.  If we expected a delay for scheduling, then
       * bump the clock to reflect that.
       */
      time = MAX2(time + 1, chosen_time);

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = 0; i < chosen->child_count; i++) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         child->parent_count--;
         if (child->parent_count == 0) {
            instructions.push_tail(child);
         }
      }

      /* Shared resource: the mathbox.  There's one per EU (on later
       * generations, it's even more limited pre-gen6), so if we send
       * something off to it then the next math isn't going to make
       * progress until the first is done.
       */
      if (chosen->inst->is_math()) {
         foreach_list(node, &instructions) {
            schedule_node *n = (schedule_node *)node;

            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

void
fs_visitor::schedule_instructions()
{
   fs_inst *next_block_header = (fs_inst *)instructions.head;
   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next);

   while (!next_block_header->is_tail_sentinel()) {
      /* Add things to be scheduled until we get to a new BB. */
      while (!next_block_header->is_tail_sentinel()) {
         fs_inst *inst = next_block_header;
         next_block_header = (fs_inst *)next_block_header->next;

         sched.add_inst(inst);
         if (inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_ELSE ||
             inst->opcode == BRW_OPCODE_ENDIF ||
             inst->opcode == BRW_OPCODE_DO ||
             inst->opcode == BRW_OPCODE_WHILE ||
             inst->opcode == BRW_OPCODE_BREAK ||
             inst->opcode == BRW_OPCODE_CONTINUE) {
            break;
         }
      }
      sched.calculate_deps();
      sched.schedule_instructions(next_block_header);
   }
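
   /* Scheduling reordered instructions within each basic block, so any
    * previously computed live intervals are now stale.
    */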
   this->live_intervals_valid = false;
}