/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "glsl/ir_print_visitor.h"
/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part. We're trying to be cheap,
 * since computing an optimal schedule is NP-complete. What we do is
 * track a "current clock". When we schedule a node, we update the
 * earliest-unblocked clock time of its children, and increment the
 * clock. Then, when trying to schedule, we just pick the
 * earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many instructions that could execute
 * immediately, and there is a range of heuristic options for picking
 * among them.
 */
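
/* A small worked example of the clock heuristic described above
 * (illustrative numbers only): suppose the ready list holds nodes with
 * unblocked_time 5, 2 and 9 and the current clock is 3. We pick the
 * node unblocked at 2, schedule it, and advance the clock to
 * MAX2(3 + 1, 2) = 4; each child of the scheduled node then becomes
 * unblocked no earlier than 4 plus the latency of the edge leading to
 * it. See schedule_instructions() below for the real loop.
 */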

class schedule_node : public exec_node
{
public:
   schedule_node(fs_inst *inst)
   {
      this->inst = inst;
      this->child_array_size = 0;
      this->children = NULL;
      this->child_latency = NULL;
      this->child_count = 0;
      this->parent_count = 0;
      this->unblocked_time = 0;

      int chans = 8;
      int math_latency = 22;

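      /* These latencies are rough estimates rather than measured values:
       * each math opcode is modeled as a number of mathbox rounds, and
       * each round is charged chans * math_latency cycles. The
       * multipliers below are the assumed round counts per opcode.
       */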
      switch (inst->opcode) {
      case SHADER_OPCODE_RCP:
         this->latency = 1 * chans * math_latency;
         break;
      case SHADER_OPCODE_RSQ:
         this->latency = 2 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_LOG2:
         /* full precision log. partial is 2. */
         this->latency = 3 * chans * math_latency;
         break;
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_EXP2:
         /* full precision. partial is 3, same throughput. */
         this->latency = 4 * chans * math_latency;
         break;
      case SHADER_OPCODE_POW:
         this->latency = 8 * chans * math_latency;
         break;
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* minimum latency, max is 12 rounds. */
         this->latency = 5 * chans * math_latency;
         break;
      default:
         this->latency = 2;
         break;
      }
   }

   fs_inst *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;
};

class instruction_scheduler {
public:
   instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
   {
      this->v = v;
      this->mem_ctx = ralloc_context(mem_ctx);
      this->virtual_grf_count = virtual_grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void add_inst(fs_inst *inst);
   void calculate_deps();
   void schedule_instructions(fs_inst *next_block_header);

   bool is_compressed(fs_inst *inst);

   void *mem_ctx;

   int instructions_to_schedule;
   int virtual_grf_count;
   exec_list instructions;
   fs_visitor *v;
};
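
/* Typical use, as in fs_visitor::schedule_instructions() at the bottom of
 * this file: construct a scheduler, add_inst() each instruction of a basic
 * block (which moves the instruction out of the shader's own list), then
 * call calculate_deps() followed by schedule_instructions(), which re-emits
 * the instructions in the chosen order.
 */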

void
instruction_scheduler::add_inst(fs_inst *inst)
{
   schedule_node *n = new(mem_ctx) schedule_node(inst);

   assert(!inst->is_head_sentinel());
   assert(!inst->is_tail_sentinel());

   this->instructions_to_schedule++;

   inst->remove();
   instructions.push_tail(n);
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before. We will try to
 * schedule it @latency cycles after @before, but this is not
 * guaranteed.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * came before it and before everything that follows it. This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         next = (schedule_node *)next->next;
      }
   }
}

/* Instruction scheduling needs to be aware of when an MRF write
 * actually writes two MRFs.
 */
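/* For example (hypothetical registers), a SIMD16 write to m2 actually
 * touches both m2 and m3 (or m2 and m6 in compr4 mode), so a later write
 * to the second register must not be scheduled across it.
 */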
bool
instruction_scheduler::is_compressed(fs_inst *inst)
{
   return (v->c->dispatch_width == 16 &&
           !inst->force_uncompressed &&
           !inst->force_sechalf);
}

void
instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[virtual_grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately. We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to remain the last instruction:
    * either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and
    * scheduling other things after it would disturb the basic block,
    * or it's an FB_WRITE and we should do a better job at dead code
    * elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
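   /* For example, in the sequence "add g4, g2, g3; mul g5, g4, g2;
    * mov g4, g6" (hypothetical registers, shown only to illustrate the
    * edges this pass creates), the mul gets a RAW dependency on the add
    * through its g4 source, and the mov gets a WAW dependency on the
    * add through the shared g4 destination.
    */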
   foreach_list(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(last_grf_write[inst->src[i].reg], n);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(last_mrf_write[inst->base_mrf + i], n);
      }

      if (inst->predicated) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         add_dep(last_grf_write[inst->dst.reg], n);
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
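         /* A compressed (SIMD16) instruction writes a second MRF as
          * well: reg + 1 normally, or reg + 4 when the COMPR4 bit is
          * set, since compr4 addressing places the second half four
          * registers above the first.
          */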
         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->conditional_mod) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
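   /* Continuing the example above, the "mov g4, g6" must also stay
    * below the "mul g5, g4, g2" that reads g4. Walking the block
    * bottom-to-top lets us record, for each register, the next write
    * below the current point and add the WAR edge to it.
    */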
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            add_dep(n, last_grf_write[inst->src[i].reg]);
         } else if (inst->src[i].file == FIXED_HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      for (int i = 0; i < inst->mlen; i++) {
         /* It looks like the MRF regs are released in the send
          * instruction once it's sent, not when the result comes
          * back.
          */
         add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
      }

      if (inst->predicated) {
         add_dep(n, last_conditional_mod);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as a WAR dependency.
       */
      if (inst->dst.file == GRF) {
         last_grf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.file != BAD_FILE) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->conditional_mod)
         last_conditional_mod = n;
   }
}

void
instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
{
   int time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_list_safe(node, &instructions) {
      schedule_node *n = (schedule_node *)node;
      if (n->parent_count != 0)
         n->remove();
   }

   while (!instructions.is_empty()) {
      schedule_node *chosen = NULL;
      int chosen_time = 0;

      foreach_list(node, &instructions) {
         schedule_node *n = (schedule_node *)node;

         if (!chosen || n->unblocked_time < chosen_time) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      next_block_header->insert_before(chosen->inst);
      instructions_to_schedule--;

      /* Bump the clock. If we expected a delay for scheduling, then
       * bump the clock to reflect that.
       */
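      /* For instance, with time == 10, a chosen_time of 14 jumps the
       * clock to 14, while a chosen_time of 8 just advances it to 11
       * (illustrative numbers only).
       */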
      time = MAX2(time + 1, chosen_time);

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled. Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = 0; i < chosen->child_count; i++) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         child->parent_count--;
         if (child->parent_count == 0) {
            instructions.push_tail(child);
         }
      }

      /* Shared resource: the mathbox. There's one per EU on later
       * generations (and it's even more limited pre-gen6), so if we
       * send something off to it then the next math instruction isn't
       * going to make progress until the first is done.
       */
      if (chosen->inst->is_math()) {
         foreach_list(node, &instructions) {
            schedule_node *n = (schedule_node *)node;

            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

void
fs_visitor::schedule_instructions()
{
   fs_inst *next_block_header = (fs_inst *)instructions.head;
   instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next);

   while (!next_block_header->is_tail_sentinel()) {
      /* Add things to be scheduled until we get to a new BB. */
      while (!next_block_header->is_tail_sentinel()) {
         fs_inst *inst = next_block_header;
         next_block_header = (fs_inst *)next_block_header->next;

         sched.add_inst(inst);
         if (inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_ELSE ||
             inst->opcode == BRW_OPCODE_ENDIF ||
             inst->opcode == BRW_OPCODE_DO ||
             inst->opcode == BRW_OPCODE_WHILE ||
             inst->opcode == BRW_OPCODE_BREAK ||
             inst->opcode == BRW_OPCODE_CONTINUE) {
            break;
         }
      }
      sched.calculate_deps();
      sched.schedule_instructions(next_block_header);
   }

   this->live_intervals_valid = false;
}