src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 extern "C" {
  29
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "main/uniforms.h"
  35 #include "program/prog_optimize.h"
  36 #include "program/register_allocate.h"
  37 #include "program/sampler.h"
  38 #include "program/hash_table.h"
  39 #include "brw_context.h"
  40 #include "brw_eu.h"
  41 #include "brw_wm.h"
  42 }
  43 #include "brw_fs.h"
  44 #include "../glsl/glsl_types.h"
  45 #include "../glsl/ir_optimization.h"
  46 #include "../glsl/ir_print_visitor.h"
  47
  48 /** @file brw_fs_schedule_instructions.cpp
  49  *
  50  * List scheduling of FS instructions.
  51  *
  52  * The basic model of the list scheduler is to take a basic block,
  53  * compute a DAG of the dependencies (RAW ordering with latency, WAW
  54  * ordering, WAR ordering), and make a list of the DAG heads.
  55  * Heuristically pick a DAG head, then put all the children that are
  56  * now DAG heads into the list of things to schedule.
  57  *
  58  * The heuristic is the important part.  We're trying to be cheap,
  59  * since actually computing the optimal scheduling is NP complete.
  60  * What we do is track a "current clock".  When we schedule a node, we
  61  * update the earliest-unblocked clock time of its children, and
  62  * increment the clock.  Then, when trying to schedule, we just pick
  63  * the earliest-unblocked instruction to schedule.
  64  *
  65  * Note that often there will be many things which could execute
  66  * immediately, and there are a range of heuristic options to choose
  67  * from in picking among those.
  68  */
  69
  70 class schedule_node : public exec_node
  71 {
  72 public:
  73    schedule_node(fs_inst *inst)
  74    {
  75       this->inst = inst;
  76       this->child_array_size = 0;
  77       this->children = NULL;
  78       this->child_latency = NULL;
  79       this->child_count = 0;
  80       this->parent_count = 0;
  81       this->unblocked_time = 0;
  82
  83       int chans = 8;
  84       int math_latency = 22;
  85
  86       switch (inst->opcode) {
  87       case FS_OPCODE_RCP:
  88          this->latency = 1 * chans * math_latency;
  89          break;
  90       case FS_OPCODE_RSQ:
  91          this->latency = 2 * chans * math_latency;
  92          break;
  93       case FS_OPCODE_SQRT:
  94       case FS_OPCODE_LOG2:
  95          /* full precision log.  partial is 2. */
  96          this->latency = 3 * chans * math_latency;
  97          break;
  98       case FS_OPCODE_EXP2:
  99          /* full precision.  partial is 3, same throughput. */
 100          this->latency = 4 * chans * math_latency;
 101          break;
 102       case FS_OPCODE_POW:
 103          this->latency = 8 * chans * math_latency;
 104          break;
 105       case FS_OPCODE_SIN:
 106       case FS_OPCODE_COS:
 107          /* minimum latency, max is 12 rounds. */
 108          this->latency = 5 * chans * math_latency;
 109          break;
 110       default:
 111          this->latency = 2;
 112          break;
 113       }
 114    }
 115
 116    fs_inst *inst;
 117    schedule_node **children;
 118    int *child_latency;
 119    int child_count;
 120    int parent_count;
 121    int child_array_size;
 122    int unblocked_time;
 123    int latency;
 124 };
 125
 126 class instruction_scheduler {
 127 public:
 128    instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
 129    {
 130       this->v = v;
 131       this->mem_ctx = ralloc_context(mem_ctx);
 132       this->virtual_grf_count = virtual_grf_count;
 133       this->instructions.make_empty();
 134       this->instructions_to_schedule = 0;
 135    }
 136
 137    ~instruction_scheduler()
 138    {
 139       ralloc_free(this->mem_ctx);
 140    }
 141    void add_barrier_deps(schedule_node *n);
 142    void add_dep(schedule_node *before, schedule_node *after, int latency);
 143
 144    void add_inst(fs_inst *inst);
 145    void calculate_deps();
 146    void schedule_instructions(fs_inst *next_block_header);
 147
 148    void *mem_ctx;
 149
 150    int instructions_to_schedule;
 151    int virtual_grf_count;
 152    exec_list instructions;
 153    fs_visitor *v;
 154 };
 155
 156 void
 157 instruction_scheduler::add_inst(fs_inst *inst)
 158 {
 159    schedule_node *n = new(mem_ctx) schedule_node(inst);
 160
 161    assert(!inst->is_head_sentinel());
 162    assert(!inst->is_tail_sentinel());
 163
 164    this->instructions_to_schedule++;
 165
 166    inst->remove();
 167    instructions.push_tail(n);
 168 }
 169
 170 /**
 171  * Add a dependency between two instruction nodes.
 172  *
 173  * The @after node will be scheduled after @before.  We will try to
 174  * schedule it @latency cycles after @before, but no guarantees there.
 175  */
 176 void
 177 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 178                                int latency)
 179 {
 180    if (!before || !after)
 181       return;
 182
 183    assert(before != after);
 184
 185    for (int i = 0; i < before->child_count; i++) {
 186       if (before->children[i] == after) {
 187          before->child_latency[i] = MAX2(before->child_latency[i], latency);
 188          return;
 189       }
 190    }
 191
 192    if (before->child_array_size <= before->child_count) {
 193       if (before->child_array_size < 16)
 194          before->child_array_size = 16;
 195       else
 196          before->child_array_size *= 2;
 197
 198       before->children = reralloc(mem_ctx, before->children,
 199                                   schedule_node *,
 200                                   before->child_array_size);
 201       before->child_latency = reralloc(mem_ctx, before->child_latency,
 202                                        int, before->child_array_size);
 203    }
 204
 205    before->children[before->child_count] = after;
 206    before->child_latency[before->child_count] = latency;
 207    before->child_count++;
 208    after->parent_count++;
 209 }
 210
 211 /**
 212  * Sometimes we really want this node to execute after everything that
 213  * was before it and before everything that followed it.  This adds
 214  * the deps to do so.
 215  */
 216 void
 217 instruction_scheduler::add_barrier_deps(schedule_node *n)
 218 {
 219    schedule_node *prev = (schedule_node *)n->prev;
 220    schedule_node *next = (schedule_node *)n->next;
 221
 222    if (prev) {
 223       while (!prev->is_head_sentinel()) {
 224          add_dep(prev, n, 0);
 225          prev = (schedule_node *)prev->prev;
 226       }
 227    }
 228
 229    if (next) {
 230       while (!next->is_tail_sentinel()) {
 231          add_dep(n, next, 0);
 232          next = (schedule_node *)next->next;
 233       }
 234    }
 235 }
 236
 237 void
 238 instruction_scheduler::calculate_deps()
 239 {
 240    schedule_node *last_grf_write[virtual_grf_count];
 241    schedule_node *last_mrf_write[BRW_MAX_MRF];
 242    schedule_node *last_conditional_mod = NULL;
 243
 244    /* The last instruction always needs to still be the last
 245     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
 246     * WHILE) and scheduling other things after it would disturb the
 247     * basic block, or it's FB_WRITE and we should do a better job at
 248     * dead code elimination anyway.
 249     */
 250    schedule_node *last = (schedule_node *)instructions.get_tail();
 251    add_barrier_deps(last);
 252
 253    memset(last_grf_write, 0, sizeof(last_grf_write));
 254    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 255
 256    /* top-to-bottom dependencies: RAW and WAW. */
 257    foreach_iter(exec_list_iterator, iter, instructions) {
 258       schedule_node *n = (schedule_node *)iter.get();
 259       fs_inst *inst = n->inst;
 260
 261       /* read-after-write deps. */
 262       for (int i = 0; i < 3; i++) {
 263          if (inst->src[i].file == GRF) {
 264             if (last_grf_write[inst->src[i].reg]) {
 265                add_dep(last_grf_write[inst->src[i].reg], n,
 266                        last_grf_write[inst->src[i].reg]->latency);
 267             }
 268          } else if (inst->src[i].file != BAD_FILE &&
 269                     inst->src[i].file != IMM &&
 270                     inst->src[i].file != UNIFORM) {
 271             assert(inst->src[i].file != MRF);
 272             add_barrier_deps(n);
 273          }
 274       }
 275
 276       for (int i = 0; i < inst->mlen; i++) {
 277          /* It looks like the MRF regs are released in the send
 278           * instruction once it's sent, not when the result comes
 279           * back.
 280           */
 281          if (last_mrf_write[inst->base_mrf + i]) {
 282             add_dep(last_mrf_write[inst->base_mrf + i], n,
 283                     last_mrf_write[inst->base_mrf + i]->latency);
 284          }
 285       }
 286
 287       if (inst->predicated) {
 288          assert(last_conditional_mod);
 289          add_dep(last_conditional_mod, n, last_conditional_mod->latency);
 290       }
 291
 292       /* write-after-write deps. */
 293       if (inst->dst.file == GRF) {
 294          if (last_grf_write[inst->dst.reg]) {
 295             add_dep(last_grf_write[inst->dst.reg], n,
 296                     last_grf_write[inst->dst.reg]->latency);
 297          }
 298          last_grf_write[inst->dst.reg] = n;
 299       } else if (inst->dst.file == MRF) {
 300          if (last_mrf_write[inst->dst.hw_reg]) {
 301             add_dep(last_mrf_write[inst->dst.hw_reg], n,
 302                     last_mrf_write[inst->dst.hw_reg]->latency);
 303          }
 304          last_mrf_write[inst->dst.hw_reg] = n;
 305       } else if (inst->dst.file != BAD_FILE) {
 306          add_barrier_deps(n);
 307       }
 308
 309       if (inst->mlen > 0) {
 310          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 311             if (last_mrf_write[inst->base_mrf + i]) {
 312                add_dep(last_mrf_write[inst->base_mrf + i], n,
 313                        last_mrf_write[inst->base_mrf + i]->latency);
 314             }
 315             last_mrf_write[inst->base_mrf + i] = n;
 316          }
 317       }
 318
 319       if (inst->conditional_mod) {
 320          add_dep(last_conditional_mod, n, 0);
 321          last_conditional_mod = n;
 322       }
 323    }
 324
 325    /* bottom-to-top dependencies: WAR */
 326    memset(last_grf_write, 0, sizeof(last_grf_write));
 327    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 328    last_conditional_mod = NULL;
 329
 330    exec_node *node;
 331    exec_node *prev;
 332    for (node = instructions.get_tail(), prev = node->prev;
 333         !node->is_head_sentinel();
 334         node = prev, prev = node->prev) {
 335       schedule_node *n = (schedule_node *)node;
 336       fs_inst *inst = n->inst;
 337
 338       /* write-after-read deps. */
 339       for (int i = 0; i < 3; i++) {
 340          if (inst->src[i].file == GRF) {
 341             if (last_grf_write[inst->src[i].reg]) {
 342                add_dep(n, last_grf_write[inst->src[i].reg], n->latency);
 343             }
 344          } else if (inst->src[i].file != BAD_FILE &&
 345                     inst->src[i].file != IMM &&
 346                     inst->src[i].file != UNIFORM) {
 347             assert(inst->src[i].file != MRF);
 348             add_barrier_deps(n);
 349          }
 350       }
 351
 352       for (int i = 0; i < inst->mlen; i++) {
 353          /* It looks like the MRF regs are released in the send
 354           * instruction once it's sent, not when the result comes
 355           * back.
 356           */
 357          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
 358       }
 359
 360       if (inst->predicated) {
 361          if (last_conditional_mod) {
 362             add_dep(n, last_conditional_mod, n->latency);
 363          }
 364       }
 365
 366       /* Update the things this instruction wrote, so earlier reads
 367        * can mark this as WAR dependency.
 368        */
 369       if (inst->dst.file == GRF) {
 370          last_grf_write[inst->dst.reg] = n;
 371       } else if (inst->dst.file == MRF) {
 372          last_mrf_write[inst->dst.hw_reg] = n;
 373       } else if (inst->dst.file != BAD_FILE) {
 374          add_barrier_deps(n);
 375       }
 376
 377       if (inst->mlen > 0) {
 378          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
 379             last_mrf_write[inst->base_mrf + i] = n;
 380          }
 381       }
 382
 383       if (inst->conditional_mod)
 384          last_conditional_mod = n;
 385    }
 386 }
 387
 388 void
 389 instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
 390 {
 391    int time = 0;
 392
 393    /* Remove non-DAG heads from the list. */
 394    foreach_iter(exec_list_iterator, iter, instructions) {
 395       schedule_node *n = (schedule_node *)iter.get();
 396       if (n->parent_count != 0)
 397          n->remove();
 398    }
 399
 400    while (!instructions.is_empty()) {
 401       schedule_node *chosen = NULL;
 402       int chosen_time = 0;
 403
 404       foreach_iter(exec_list_iterator, iter, instructions) {
 405          schedule_node *n = (schedule_node *)iter.get();
 406
 407          if (!chosen || n->unblocked_time < chosen_time) {
 408             chosen = n;
 409             chosen_time = n->unblocked_time;
 410          }
 411       }
 412
 413       /* Schedule this instruction. */
 414       assert(chosen);
 415       chosen->remove();
 416       next_block_header->insert_before(chosen->inst);
 417       instructions_to_schedule--;
 418
 419       /* Bump the clock.  If we expected a delay for scheduling, then
 420        * bump the clock to reflect that.
 421        */
 422       time = MAX2(time + 1, chosen_time);
 423
 424       /* Now that we've scheduled a new instruction, some of its
 425        * children can be promoted to the list of instructions ready to
 426        * be scheduled.  Update the children's unblocked time for this
 427        * DAG edge as we do so.
 428        */
 429       for (int i = 0; i < chosen->child_count; i++) {
 430          schedule_node *child = chosen->children[i];
 431
 432          child->unblocked_time = MAX2(child->unblocked_time,
 433                                       time + chosen->child_latency[i]);
 434
 435          child->parent_count--;
 436          if (child->parent_count == 0) {
 437             instructions.push_tail(child);
 438          }
 439       }
 440
 441       /* Shared resource: the mathbox.  There's one per EU (on later
 442        * generations, it's even more limited pre-gen6), so if we send
 443        * something off to it then the next math isn't going to make
 444        * progress until the first is done.
 445        */
 446       if (chosen->inst->is_math()) {
 447          foreach_iter(exec_list_iterator, iter, instructions) {
 448             schedule_node *n = (schedule_node *)iter.get();
 449
 450             if (n->inst->is_math())
 451                n->unblocked_time = MAX2(n->unblocked_time,
 452                                         time + chosen->latency);
 453          }
 454       }
 455    }
 456
 457    assert(instructions_to_schedule == 0);
 458 }
 459
 460 void
 461 fs_visitor::schedule_instructions()
 462 {
 463    fs_inst *next_block_header = (fs_inst *)instructions.head;
 464    instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next);
 465
 466    while (!next_block_header->is_tail_sentinel()) {
 467       /* Add things to be scheduled until we get to a new BB. */
 468       while (!next_block_header->is_tail_sentinel()) {
 469          fs_inst *inst = next_block_header;
 470          next_block_header = (fs_inst *)next_block_header->next;
 471
 472          sched.add_inst(inst);
 473          if (inst->opcode == BRW_OPCODE_IF ||
 474              inst->opcode == BRW_OPCODE_ELSE ||
 475              inst->opcode == BRW_OPCODE_ENDIF ||
 476              inst->opcode == BRW_OPCODE_DO ||
 477              inst->opcode == BRW_OPCODE_WHILE ||
 478              inst->opcode == BRW_OPCODE_BREAK ||
 479              inst->opcode == BRW_OPCODE_CONTINUE) {
 480             break;
 481          }
 482       }
 483       sched.calculate_deps();
 484       sched.schedule_instructions(next_block_header);
 485    }
 486
 487    this->live_intervals_valid = false;
 488 }