i965: Note that write-after-write dependencies are blocking.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_schedule_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "glsl/glsl_types.h"
30 #include "glsl/ir_optimization.h"
31 #include "glsl/ir_print_visitor.h"
32
33 /** @file brw_fs_schedule_instructions.cpp
34 *
35 * List scheduling of FS instructions.
36 *
37 * The basic model of the list scheduler is to take a basic block,
38 * compute a DAG of the dependencies (RAW ordering with latency, WAW
39 * ordering with latency, WAR ordering), and make a list of the DAG heads.
40 * Heuristically pick a DAG head, then put all the children that are
41 * now DAG heads into the list of things to schedule.
42 *
43 * The heuristic is the important part. We're trying to be cheap,
44 * since actually computing an optimal schedule is NP-complete.
45 * What we do is track a "current clock". When we schedule a node, we
46 * update the earliest-unblocked clock time of its children, and
47 * increment the clock. Then, when trying to schedule, we just pick
48 * the earliest-unblocked instruction to schedule.
49 *
50 * Note that often there will be many things which could execute
51 * immediately, and there is a range of heuristic options to choose
52 * from in picking among those.
53 */
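
/* As a rough sketch (not the exact code below), the loop described above
 * looks something like this; pick_lowest_unblocked_time(), emit(),
 * issue_cycles(), ready_list and add_to_ready_list() are placeholders for
 * the real logic in instruction_scheduler::schedule_instructions():
 *
 *    while (!ready_list_is_empty()) {
 *       schedule_node *n = pick_lowest_unblocked_time(ready_list);
 *       emit(n->inst);
 *       time = MAX2(time + issue_cycles(n->inst), n->unblocked_time);
 *
 *       for (int i = 0; i < n->child_count; i++) {
 *          schedule_node *child = n->children[i];
 *          child->unblocked_time = MAX2(child->unblocked_time,
 *                                       time + n->child_latency[i]);
 *          if (--child->parent_count == 0)
 *             add_to_ready_list(child);
 *       }
 *    }
 */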
54
55 static bool debug = false;
56
57 class schedule_node : public exec_node
58 {
59 public:
60 schedule_node(fs_inst *inst, int gen)
61 {
62 this->inst = inst;
63 this->child_array_size = 0;
64 this->children = NULL;
65 this->child_latency = NULL;
66 this->child_count = 0;
67 this->parent_count = 0;
68 this->unblocked_time = 0;
69
70 if (gen >= 7)
71 set_latency_gen7();
72 else
73 set_latency_gen4();
74 }
75
76 void set_latency_gen4();
77 void set_latency_gen7();
78
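   /* DAG bookkeeping: children[]/child_latency[] are this node's outgoing
    * dependency edges, parent_count is how many unscheduled parents still
    * block it, latency models the cost of this instruction's result, and
    * unblocked_time is the earliest clock at which the scheduler should
    * consider issuing it.
    */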
79 fs_inst *inst;
80 schedule_node **children;
81 int *child_latency;
82 int child_count;
83 int parent_count;
84 int child_array_size;
85 int unblocked_time;
86 int latency;
87 };
88
89 void
90 schedule_node::set_latency_gen4()
91 {
92 int chans = 8;
93 int math_latency = 22;
94
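   /* The numbers below follow a simple model: each math opcode costs some
    * number of internal rounds, and each round is charged math_latency
    * cycles per channel.  For example, POW at 8 rounds comes out to
    * 8 * 8 * 22 = 1408 cycles of modeled latency.
    */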
95 switch (inst->opcode) {
96 case SHADER_OPCODE_RCP:
97 this->latency = 1 * chans * math_latency;
98 break;
99 case SHADER_OPCODE_RSQ:
100 this->latency = 2 * chans * math_latency;
101 break;
102 case SHADER_OPCODE_INT_QUOTIENT:
103 case SHADER_OPCODE_SQRT:
104 case SHADER_OPCODE_LOG2:
105 /* full precision log. partial is 2. */
106 this->latency = 3 * chans * math_latency;
107 break;
108 case SHADER_OPCODE_INT_REMAINDER:
109 case SHADER_OPCODE_EXP2:
110 /* full precision. partial is 3, same throughput. */
111 this->latency = 4 * chans * math_latency;
112 break;
113 case SHADER_OPCODE_POW:
114 this->latency = 8 * chans * math_latency;
115 break;
116 case SHADER_OPCODE_SIN:
117 case SHADER_OPCODE_COS:
118 /* minimum latency, max is 12 rounds. */
119 this->latency = 5 * chans * math_latency;
120 break;
121 default:
122 this->latency = 2;
123 break;
124 }
125 }
126
127 void
128 schedule_node::set_latency_gen7()
129 {
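   /* The cycle counts quoted in the comments below come from timing short
    * instruction sequences: the first listing measures issuing the
    * instruction with nothing waiting on its result, the second adds a
    * dependent MOV, and the latency chosen is roughly the difference
    * between the two.
    */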
130 switch (inst->opcode) {
131 case BRW_OPCODE_MAD:
132 /* 3 cycles (this is sometimes said to be 4 cycles, depending on the
133 * register numbers in the sources):
134 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
135 *
136 * 20 cycles:
137 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
138 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
139 */
140 latency = 17;
141 break;
142
143 case SHADER_OPCODE_RCP:
144 case SHADER_OPCODE_RSQ:
145 case SHADER_OPCODE_SQRT:
146 case SHADER_OPCODE_LOG2:
147 case SHADER_OPCODE_EXP2:
148 case SHADER_OPCODE_SIN:
149 case SHADER_OPCODE_COS:
150 /* 2 cycles:
151 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
152 *
153 * 18 cycles:
154 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
155 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
156 *
157 * Same for exp2, log2, rsq, sqrt, sin, cos.
158 */
159 latency = 16;
160 break;
161
162 case SHADER_OPCODE_POW:
163 /* 2 cycles:
164 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
165 *
166 * 26 cycles:
167 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
168 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
169 */
170 latency = 24;
171 break;
172
173 case SHADER_OPCODE_TEX:
174 case SHADER_OPCODE_TXD:
175 case SHADER_OPCODE_TXF:
176 case SHADER_OPCODE_TXL:
177 /* 18 cycles:
178 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
179 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
180 * send(8) g4<1>UW g114<8,8,1>F
181 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
182 *
183 * 697 +/-49 cycles (min 610, n=26):
184 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
185 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
186 * send(8) g4<1>UW g114<8,8,1>F
187 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
188 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
189 *
190 * So our first texture load of the batchbuffer takes ~700 cycles,
191 * since the caches are cold at that point.
192 *
193 * 840 +/- 92 cycles (min 720, n=25):
194 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
195 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
196 * send(8) g4<1>UW g114<8,8,1>F
197 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
198 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
199 * send(8) g4<1>UW g114<8,8,1>F
200 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
201 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
202 *
203 * On the second load, it takes just an extra ~140 cycles, and after
204 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
205 *
206 * 683 +/- 49 cycles (min = 602, n=47):
207 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
208 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
209 * send(8) g4<1>UW g114<8,8,1>F
210 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
211 * send(8) g50<1>UW g114<8,8,1>F
212 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
213 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
214 *
215 * The unit appears to be pipelined, since this matches up with the
216 * cache-cold case, despite there being two loads here. If you replace
217 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
218 *
219 * So, take some number between the cache-hot 140 cycles and the
220 * cache-cold 700 cycles. No particular tuning was done on this.
221 *
222 * I haven't done significant testing of the non-TEX opcodes. TXL at
223 * least looked about the same as TEX.
224 */
225 latency = 200;
226 break;
227
228 case SHADER_OPCODE_TXS:
229 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
230 * cycles (n=15):
231 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
232 * send(8) g6<1>UW g114<8,8,1>F
233 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
234 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
235 *
236 *
237 * Two loads was 535 +/- 30 cycles (n=19):
238 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
239 * send(16) g6<1>UW g114<8,8,1>F
240 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
241 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
242 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
243 * send(16) g8<1>UW g114<8,8,1>F
244 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
245 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
246 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
247 *
248 * Since the only cache that should matter is the instruction/state
249 * cache containing the surface state, assume that we always have hot
250 * caches.
251 */
252 latency = 100;
253 break;
254
255 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
256 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
257 /* Testing with varying-index pull constants:
258 *
259 * 16 cycles:
260 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
261 * send(8) g4<1>F g4<8,8,1>D
262 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
263 *
264 * ~480 cycles:
265 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
266 * send(8) g4<1>F g4<8,8,1>D
267 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
268 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
269 *
270 * ~620 cycles:
271 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
272 * send(8) g4<1>F g4<8,8,1>D
273 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
274 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
275 * send(8) g4<1>F g4<8,8,1>D
276 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
277 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
278 *
279 * So, if it's cache-hot, it's about 140 (~620 minus ~480). If it's
280 * cache-cold, it's about 460 (~480 minus the 16-cycle case). We expect
281 * to mostly be cache-hot, so pick something more in that direction.
282 */
283 latency = 200;
284 break;
285
286 default:
287 /* 2 cycles:
288 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
289 *
290 * 16 cycles:
291 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
292 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
293 */
294 latency = 14;
295 break;
296 }
297 }
298
299 class instruction_scheduler {
300 public:
301 instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
302 bool post_reg_alloc)
303 {
304 this->v = v;
305 this->mem_ctx = ralloc_context(mem_ctx);
306 this->grf_count = grf_count;
307 this->instructions.make_empty();
308 this->instructions_to_schedule = 0;
309 this->post_reg_alloc = post_reg_alloc;
310 }
311
312 ~instruction_scheduler()
313 {
314 ralloc_free(this->mem_ctx);
315 }
316 void add_barrier_deps(schedule_node *n);
317 void add_dep(schedule_node *before, schedule_node *after, int latency);
318 void add_dep(schedule_node *before, schedule_node *after);
319
320 void add_inst(fs_inst *inst);
321 void calculate_deps();
322 void schedule_instructions(fs_inst *next_block_header);
323
324 bool is_compressed(fs_inst *inst);
325
326 void *mem_ctx;
327
328 bool post_reg_alloc;
329 int instructions_to_schedule;
330 int grf_count;
331 exec_list instructions;
332 fs_visitor *v;
333 };
334
335 void
336 instruction_scheduler::add_inst(fs_inst *inst)
337 {
338 schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
339
340 assert(!inst->is_head_sentinel());
341 assert(!inst->is_tail_sentinel());
342
343 this->instructions_to_schedule++;
344
345 inst->remove();
346 instructions.push_tail(n);
347 }
348
349 /**
350 * Add a dependency between two instruction nodes.
351 *
352 * The @after node will be scheduled after @before. We will try to
353 * schedule it @latency cycles after @before, but no guarantees there.
354 */
355 void
356 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
357 int latency)
358 {
359 if (!before || !after)
360 return;
361
362 assert(before != after);
363
364 for (int i = 0; i < before->child_count; i++) {
365 if (before->children[i] == after) {
366 before->child_latency[i] = MAX2(before->child_latency[i], latency);
367 return;
368 }
369 }
370
371 if (before->child_array_size <= before->child_count) {
372 if (before->child_array_size < 16)
373 before->child_array_size = 16;
374 else
375 before->child_array_size *= 2;
376
377 before->children = reralloc(mem_ctx, before->children,
378 schedule_node *,
379 before->child_array_size);
380 before->child_latency = reralloc(mem_ctx, before->child_latency,
381 int, before->child_array_size);
382 }
383
384 before->children[before->child_count] = after;
385 before->child_latency[before->child_count] = latency;
386 before->child_count++;
387 after->parent_count++;
388 }
389
390 void
391 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
392 {
393 if (!before)
394 return;
395
396 add_dep(before, after, before->latency);
397 }
398
399 /**
400 * Sometimes we really want this node to execute after everything that
401 * was before it and before everything that followed it. This adds
402 * the deps to do so.
403 */
404 void
405 instruction_scheduler::add_barrier_deps(schedule_node *n)
406 {
407 schedule_node *prev = (schedule_node *)n->prev;
408 schedule_node *next = (schedule_node *)n->next;
409
410 if (prev) {
411 while (!prev->is_head_sentinel()) {
412 add_dep(prev, n, 0);
413 prev = (schedule_node *)prev->prev;
414 }
415 }
416
417 if (next) {
418 while (!next->is_tail_sentinel()) {
419 add_dep(n, next, 0);
420 next = (schedule_node *)next->next;
421 }
422 }
423 }
424
425 /* Instruction scheduling needs to be aware of when an MRF write
426 * actually writes 2 MRFs.
427 */
428 bool
429 instruction_scheduler::is_compressed(fs_inst *inst)
430 {
431 return (v->dispatch_width == 16 &&
432 !inst->force_uncompressed &&
433 !inst->force_sechalf);
434 }
435
436 void
437 instruction_scheduler::calculate_deps()
438 {
439 /* Pre-register-allocation, this tracks the last write per VGRF (so
440 * different reg_offsets within it can interfere when they shouldn't).
441 * After register allocation, reg_offsets are gone and we track individual
442 * GRF registers.
443 */
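   /* For example, before allocation a texture result spanning several
    * reg_offsets of one VGRF shares a single last_grf_write[] slot, so a
    * write to one reg_offset and a read of another look dependent even
    * though they touch different registers; after allocation each hardware
    * GRF gets its own slot and only real overlaps create edges.
    */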
444 schedule_node *last_grf_write[grf_count];
445 schedule_node *last_mrf_write[BRW_MAX_MRF];
446 schedule_node *last_conditional_mod[2] = { NULL, NULL };
447 /* Fixed HW registers are assumed to be separate from the virtual
448 * GRFs, so they can be tracked separately. We don't really write
449 * to fixed GRFs much, so don't bother tracking them on a more
450 * granular level.
451 */
452 schedule_node *last_fixed_grf_write = NULL;
453 int reg_width = v->dispatch_width / 8;
454
455 /* The last instruction always needs to still be the last
456 * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
457 * WHILE) and scheduling other things after it would disturb the
458 * basic block, or it's FB_WRITE and we should do a better job at
459 * dead code elimination anyway.
460 */
461 schedule_node *last = (schedule_node *)instructions.get_tail();
462 add_barrier_deps(last);
463
464 memset(last_grf_write, 0, sizeof(last_grf_write));
465 memset(last_mrf_write, 0, sizeof(last_mrf_write));
466
467 /* top-to-bottom dependencies: RAW and WAW. */
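   /* For example, once an instruction writes a GRF, a later read of it
    * becomes a child of that write with the write's latency (RAW), and a
    * later write of the same GRF also becomes a child (WAW), so the two
    * writes can't be reordered past each other.
    */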
468 foreach_list(node, &instructions) {
469 schedule_node *n = (schedule_node *)node;
470 fs_inst *inst = n->inst;
471
472 /* read-after-write deps. */
473 for (int i = 0; i < 3; i++) {
474 if (inst->src[i].file == GRF) {
475 if (post_reg_alloc) {
476 for (int r = 0; r < reg_width; r++)
477 add_dep(last_grf_write[inst->src[i].reg + r], n);
478 } else {
479 add_dep(last_grf_write[inst->src[i].reg], n);
480 }
481 } else if (inst->src[i].file == FIXED_HW_REG &&
482 (inst->src[i].fixed_hw_reg.file ==
483 BRW_GENERAL_REGISTER_FILE)) {
484 if (post_reg_alloc) {
485 for (int r = 0; r < reg_width; r++)
486 add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
487 } else {
488 add_dep(last_fixed_grf_write, n);
489 }
490 } else if (inst->src[i].file != BAD_FILE &&
491 inst->src[i].file != IMM &&
492 inst->src[i].file != UNIFORM) {
493 assert(inst->src[i].file != MRF);
494 add_barrier_deps(n);
495 }
496 }
497
498 for (int i = 0; i < inst->mlen; i++) {
499 /* It looks like the MRF regs are released in the send
500 * instruction once it's sent, not when the result comes
501 * back.
502 */
503 add_dep(last_mrf_write[inst->base_mrf + i], n);
504 }
505
506 if (inst->predicate) {
507 add_dep(last_conditional_mod[inst->flag_subreg], n);
508 }
509
510 /* write-after-write deps. */
511 if (inst->dst.file == GRF) {
512 if (post_reg_alloc) {
513 for (int r = 0; r < inst->regs_written() * reg_width; r++) {
514 add_dep(last_grf_write[inst->dst.reg + r], n);
515 last_grf_write[inst->dst.reg + r] = n;
516 }
517 } else {
518 add_dep(last_grf_write[inst->dst.reg], n);
519 last_grf_write[inst->dst.reg] = n;
520 }
521 } else if (inst->dst.file == MRF) {
522 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
523
524 add_dep(last_mrf_write[reg], n);
525 last_mrf_write[reg] = n;
526 if (is_compressed(inst)) {
527 if (inst->dst.reg & BRW_MRF_COMPR4)
528 reg += 4;
529 else
530 reg++;
531 add_dep(last_mrf_write[reg], n);
532 last_mrf_write[reg] = n;
533 }
534 } else if (inst->dst.file == FIXED_HW_REG &&
535 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
536 if (post_reg_alloc) {
537 for (int r = 0; r < reg_width; r++)
538 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
539 } else {
540 last_fixed_grf_write = n;
541 }
542 } else if (inst->dst.file != BAD_FILE) {
543 add_barrier_deps(n);
544 }
545
546 if (inst->mlen > 0) {
547 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
548 add_dep(last_mrf_write[inst->base_mrf + i], n);
549 last_mrf_write[inst->base_mrf + i] = n;
550 }
551 }
552
553 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
554 * conditional_mod, because it sets the flag register.
555 */
556 if (inst->conditional_mod ||
557 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
558 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
559 last_conditional_mod[inst->flag_subreg] = n;
560 }
561 }
562
563 /* bottom-to-top dependencies: WAR */
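   /* Walking the block bottom-to-top means every read we visit is earlier
    * in the program than the writes recorded so far, so a read of a
    * register that is overwritten later in the block produces a
    * read -> write edge, keeping the write from being scheduled above the
    * read.
    */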
564 memset(last_grf_write, 0, sizeof(last_grf_write));
565 memset(last_mrf_write, 0, sizeof(last_mrf_write));
566 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
567 last_fixed_grf_write = NULL;
568
569 exec_node *node;
570 exec_node *prev;
571 for (node = instructions.get_tail(), prev = node->prev;
572 !node->is_head_sentinel();
573 node = prev, prev = node->prev) {
574 schedule_node *n = (schedule_node *)node;
575 fs_inst *inst = n->inst;
576
577 /* write-after-read deps. */
578 for (int i = 0; i < 3; i++) {
579 if (inst->src[i].file == GRF) {
580 if (post_reg_alloc) {
581 for (int r = 0; r < reg_width; r++)
582 add_dep(n, last_grf_write[inst->src[i].reg + r]);
583 } else {
584 add_dep(n, last_grf_write[inst->src[i].reg]);
585 }
586 } else if (inst->src[i].file == FIXED_HW_REG &&
587 (inst->src[i].fixed_hw_reg.file ==
588 BRW_GENERAL_REGISTER_FILE)) {
589 if (post_reg_alloc) {
590 for (int r = 0; r < reg_width; r++)
591 add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
592 } else {
593 add_dep(n, last_fixed_grf_write);
594 }
595 } else if (inst->src[i].file != BAD_FILE &&
596 inst->src[i].file != IMM &&
597 inst->src[i].file != UNIFORM) {
598 assert(inst->src[i].file != MRF);
599 add_barrier_deps(n);
600 }
601 }
602
603 for (int i = 0; i < inst->mlen; i++) {
604 /* It looks like the MRF regs are released in the send
605 * instruction once it's sent, not when the result comes
606 * back.
607 */
608 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
609 }
610
611 if (inst->predicate) {
612 add_dep(n, last_conditional_mod[inst->flag_subreg]);
613 }
614
615 /* Update the things this instruction wrote, so earlier reads
616 * can mark a WAR dependency on this write.
617 */
618 if (inst->dst.file == GRF) {
619 if (post_reg_alloc) {
620 for (int r = 0; r < inst->regs_written() * reg_width; r++)
621 last_grf_write[inst->dst.reg + r] = n;
622 } else {
623 last_grf_write[inst->dst.reg] = n;
624 }
625 } else if (inst->dst.file == MRF) {
626 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
627
628 last_mrf_write[reg] = n;
629
630 if (is_compressed(inst)) {
631 if (inst->dst.reg & BRW_MRF_COMPR4)
632 reg += 4;
633 else
634 reg++;
635
636 last_mrf_write[reg] = n;
637 }
638 } else if (inst->dst.file == FIXED_HW_REG &&
639 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
640 if (post_reg_alloc) {
641 for (int r = 0; r < reg_width; r++)
642 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
643 } else {
644 last_fixed_grf_write = n;
645 }
646 } else if (inst->dst.file != BAD_FILE) {
647 add_barrier_deps(n);
648 }
649
650 if (inst->mlen > 0) {
651 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
652 last_mrf_write[inst->base_mrf + i] = n;
653 }
654 }
655
656 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
657 * conditional_mod, because it sets the flag register.
658 */
659 if (inst->conditional_mod ||
660 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
661 last_conditional_mod[inst->flag_subreg] = n;
662 }
663 }
664 }
665
666 void
667 instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
668 {
669 int time = 0;
670
671 /* Remove non-DAG heads from the list. */
672 foreach_list_safe(node, &instructions) {
673 schedule_node *n = (schedule_node *)node;
674 if (n->parent_count != 0)
675 n->remove();
676 }
677
678 while (!instructions.is_empty()) {
679 schedule_node *chosen = NULL;
680 int chosen_time = 0;
681
682 if (post_reg_alloc) {
683 /* Of the instructions that are ready to execute, or the closest to
684 * being ready, choose the oldest one.
685 */
686 foreach_list(node, &instructions) {
687 schedule_node *n = (schedule_node *)node;
688
689 if (!chosen || n->unblocked_time < chosen_time) {
690 chosen = n;
691 chosen_time = n->unblocked_time;
692 }
693 }
694 } else {
695 /* Before register allocation, we don't care about the latencies of
696 * instructions. All we care about is reducing live intervals of
697 * variables so that we can avoid register spilling, or get 16-wide
698 * shaders which naturally do a better job of hiding instruction
699 * latency.
700 *
701 * To do so, schedule our instructions in a roughly LIFO/depth-first
702 * order: when new instructions become available as a result of
703 * scheduling something, choose those first so that our result
704 * hopefully is consumed quickly.
705 *
706 * The exception is messages that generate more than one result
707 * register (AKA texturing). In those cases, the LIFO search would
708 * normally tend to choose them quickly (because scheduling the
709 * previous message not only unblocked the children using its result,
710 * but also the MRF setup for the next sampler message, which in turn
711 * unblocks the next sampler message).
712 */
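         /* So prefer the newest available instruction that writes at most
          * one register, and take a multi-register message only when
          * nothing else is available.
          */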
713 for (schedule_node *node = (schedule_node *)instructions.get_tail();
714 node != instructions.get_head()->prev;
715 node = (schedule_node *)node->prev) {
716 schedule_node *n = (schedule_node *)node;
717
718 chosen = n;
719 if (chosen->inst->regs_written() <= 1)
720 break;
721 }
722
723 chosen_time = chosen->unblocked_time;
724 }
725
726 /* Schedule this instruction. */
727 assert(chosen);
728 chosen->remove();
729 next_block_header->insert_before(chosen->inst);
730 instructions_to_schedule--;
731
732 /* Bump the clock. Instructions in gen hardware are handled one simd4
733 * vector at a time, with 1 cycle per vector dispatched. Thus 8-wide
734 * pixel shaders take 2 cycles to dispatch and 16-wide (compressed)
735 * instructions take 4.
736 */
737 if (is_compressed(chosen->inst))
738 time += 4;
739 else
740 time += 2;
741
742 /* If we expected a delay for scheduling, then bump the clock to reflect
743 * that as well. In reality, the hardware will switch to another
744 * hyperthread and may not return to dispatching our thread for a while
745 * even after we're unblocked.
746 */
747 time = MAX2(time, chosen_time);
748
749 if (debug) {
750 printf("clock %4d, scheduled: ", time);
751 v->dump_instruction(chosen->inst);
752 }
753
754 /* Now that we've scheduled a new instruction, some of its
755 * children can be promoted to the list of instructions ready to
756 * be scheduled. Update the children's unblocked time for this
757 * DAG edge as we do so.
758 */
759 for (int i = 0; i < chosen->child_count; i++) {
760 schedule_node *child = chosen->children[i];
761
762 child->unblocked_time = MAX2(child->unblocked_time,
763 time + chosen->child_latency[i]);
764
765 child->parent_count--;
766 if (child->parent_count == 0) {
767 if (debug) {
768 printf("now available: ");
769 v->dump_instruction(child->inst);
770 }
771 instructions.push_tail(child);
772 }
773 }
774
775 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
776 * but it's more limited pre-gen6, so if we send something off to it then
777 * the next math instruction isn't going to make progress until the first
778 * is done.
779 */
780 if (chosen->inst->is_math()) {
781 foreach_list(node, &instructions) {
782 schedule_node *n = (schedule_node *)node;
783
784 if (n->inst->is_math())
785 n->unblocked_time = MAX2(n->unblocked_time,
786 time + chosen->latency);
787 }
788 }
789 }
790
791 assert(instructions_to_schedule == 0);
792 }
793
794 void
795 fs_visitor::schedule_instructions(bool post_reg_alloc)
796 {
797 fs_inst *next_block_header = (fs_inst *)instructions.head;
798
799 int grf_count;
800 if (post_reg_alloc)
801 grf_count = grf_used;
802 else
803 grf_count = virtual_grf_count;
804
805 if (debug) {
806 printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
807 dump_instructions();
808 }
809
810 instruction_scheduler sched(this, mem_ctx, grf_count, post_reg_alloc);
811
812 while (!next_block_header->is_tail_sentinel()) {
813 /* Add things to be scheduled until we get to a new BB. */
814 while (!next_block_header->is_tail_sentinel()) {
815 fs_inst *inst = next_block_header;
816 next_block_header = (fs_inst *)next_block_header->next;
817
818 sched.add_inst(inst);
819 if (inst->is_control_flow())
820 break;
821 }
822 sched.calculate_deps();
823 sched.schedule_instructions(next_block_header);
824 }
825
826 if (debug) {
827 printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
828 dump_instructions();
829 }
830
831 this->live_intervals_valid = false;
832 }