i965: Move most of the FS instruction scheduler code to a general class.
[mesa.git] src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "glsl/glsl_types.h"
30 #include "glsl/ir_optimization.h"
31 #include "glsl/ir_print_visitor.h"
32
33 /** @file brw_schedule_instructions.cpp
34 *
35 * List scheduling of FS instructions.
36 *
37 * The basic model of the list scheduler is to take a basic block,
38 * compute a DAG of the dependencies (RAW ordering with latency, WAW
39 * ordering with latency, WAR ordering), and make a list of the DAG heads.
40 * Heuristically pick a DAG head, then put all the children that are
41 * now DAG heads into the list of things to schedule.
42 *
43 * The heuristic is the important part. We're trying to be cheap,
44 * since actually computing the optimal scheduling is NP-complete.
45 * What we do is track a "current clock". When we schedule a node, we
46 * update the earliest-unblocked clock time of its children, and
47 * increment the clock. Then, when trying to schedule, we just pick
48 * the earliest-unblocked instruction to schedule.
49 *
50 * Note that often there will be many things which could execute
51 * immediately, and there are a range of heuristic options to choose
52 * from in picking among those.
53 */
54
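/* A minimal sketch of the scheduling loop described above, stripped of the
 * driver types. The toy_node type and toy_schedule() function are
 * hypothetical stand-ins for schedule_node and
 * instruction_scheduler::schedule_instructions(); they only illustrate the
 * "pick the earliest-unblocked ready node, then unblock its children"
 * heuristic, not dependency calculation or the shared-math-unit handling:
 *
 *    #include <algorithm>
 *    #include <vector>
 *
 *    struct toy_node {
 *       int unblocked_time;                // earliest clock it can issue at
 *       int latency;                       // cycles until its result is ready
 *       int parent_count;                  // unscheduled dependencies left
 *       std::vector<toy_node *> children;  // nodes depending on this one
 *    };
 *
 *    // 'ready' initially holds the DAG heads (nodes with parent_count == 0).
 *    static void
 *    toy_schedule(std::vector<toy_node *> ready, int issue_time)
 *    {
 *       int clock = 0;
 *
 *       while (!ready.empty()) {
 *          // Heuristic: pick the ready node that is unblocked earliest.
 *          unsigned best = 0;
 *          for (unsigned i = 1; i < ready.size(); i++) {
 *             if (ready[i]->unblocked_time < ready[best]->unblocked_time)
 *                best = i;
 *          }
 *          toy_node *chosen = ready[best];
 *          ready.erase(ready.begin() + best);
 *
 *          // Charge the issue cost, then stall if the node isn't unblocked yet.
 *          clock += issue_time;
 *          clock = std::max(clock, chosen->unblocked_time);
 *
 *          // Children whose last dependency just issued become DAG heads.
 *          for (unsigned i = 0; i < chosen->children.size(); i++) {
 *             toy_node *child = chosen->children[i];
 *             child->unblocked_time = std::max(child->unblocked_time,
 *                                              clock + chosen->latency);
 *             if (--child->parent_count == 0)
 *                ready.push_back(child);
 *          }
 *       }
 *    }
 */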
55 static bool debug = false;
56
57 class schedule_node : public exec_node
58 {
59 public:
60 schedule_node(backend_instruction *inst, const struct intel_context *intel)
61 {
62 this->inst = inst;
63 this->child_array_size = 0;
64 this->children = NULL;
65 this->child_latency = NULL;
66 this->child_count = 0;
67 this->parent_count = 0;
68 this->unblocked_time = 0;
69
70 /* We can't measure Gen6 timings directly but expect them to be much
71 * closer to Gen7 than Gen4.
72 */
73 if (intel->gen >= 6)
74 set_latency_gen7(intel->is_haswell);
75 else
76 set_latency_gen4();
77 }
78
79 void set_latency_gen4();
80 void set_latency_gen7(bool is_haswell);
81
82 backend_instruction *inst;
83 schedule_node **children;
84 int *child_latency;
85 int child_count;
86 int parent_count;
87 int child_array_size;
88 int unblocked_time;
89 int latency;
90 };
91
92 void
93 schedule_node::set_latency_gen4()
94 {
95 int chans = 8;
96 int math_latency = 22;
97
98 switch (inst->opcode) {
99 case SHADER_OPCODE_RCP:
100 this->latency = 1 * chans * math_latency;
101 break;
102 case SHADER_OPCODE_RSQ:
103 this->latency = 2 * chans * math_latency;
104 break;
105 case SHADER_OPCODE_INT_QUOTIENT:
106 case SHADER_OPCODE_SQRT:
107 case SHADER_OPCODE_LOG2:
108 /* full precision log. partial is 2. */
109 this->latency = 3 * chans * math_latency;
110 break;
111 case SHADER_OPCODE_INT_REMAINDER:
112 case SHADER_OPCODE_EXP2:
113 /* full precision. partial is 3, same throughput. */
114 this->latency = 4 * chans * math_latency;
115 break;
116 case SHADER_OPCODE_POW:
117 this->latency = 8 * chans * math_latency;
118 break;
119 case SHADER_OPCODE_SIN:
120 case SHADER_OPCODE_COS:
121 /* minimum latency, max is 12 rounds. */
122 this->latency = 5 * chans * math_latency;
123 break;
124 default:
125 this->latency = 2;
126 break;
127 }
128 }
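/* As a worked example of the model above (all numbers straight from the
 * switch): a Gen4 POW is charged 8 * 8 (chans) * 22 (math_latency) = 1408
 * cycles of latency, SIN/COS 5 * 8 * 22 = 880, RCP 1 * 8 * 22 = 176, and
 * any ordinary ALU instruction just 2.
 */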
129
130 void
131 schedule_node::set_latency_gen7(bool is_haswell)
132 {
133 switch (inst->opcode) {
134 case BRW_OPCODE_MAD:
135 /* 2 cycles
136 * (since the last two src operands are in different register banks):
137 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
138 *
139 * 3 cycles on IVB, 4 on HSW
140 * (since the last two src operands are in the same register bank):
141 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
142 *
143 * 18 cycles on IVB, 16 on HSW
144 * (since the last two src operands are in different register banks):
145 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
146 * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
147 *
148 * 20 cycles on IVB, 18 on HSW
149 * (since the last two src operands are in the same register bank):
150 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
151 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
152 */
153
154 /* Our register allocator doesn't know about register banks, so use the
155 * higher latency.
156 */
157 latency = is_haswell ? 16 : 18;
158 break;
159
160 case BRW_OPCODE_LRP:
161 /* 2 cycles
162 * (since the last two src operands are in different register banks):
163 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
164 *
165 * 3 cycles on IVB, 4 on HSW
166 * (since the last two src operands are in the same register bank):
167 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
168 *
169 * 16 cycles on IVB, 14 on HSW
170 * (since the last two src operands are in different register banks):
171 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
172 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
173 *
174 * 16 cycles
175 * (since the last two src operands are in the same register bank):
176 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
177 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
178 */
179
180 /* Our register allocator doesn't know about register banks, so use the
181 * higher latency.
182 */
183 latency = 14;
184 break;
185
186 case SHADER_OPCODE_RCP:
187 case SHADER_OPCODE_RSQ:
188 case SHADER_OPCODE_SQRT:
189 case SHADER_OPCODE_LOG2:
190 case SHADER_OPCODE_EXP2:
191 case SHADER_OPCODE_SIN:
192 case SHADER_OPCODE_COS:
193 /* 2 cycles:
194 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
195 *
196 * 18 cycles:
197 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
198 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
199 *
200 * Same for exp2, log2, rsq, sqrt, sin, cos.
201 */
202 latency = is_haswell ? 14 : 16;
203 break;
204
205 case SHADER_OPCODE_POW:
206 /* 2 cycles:
207 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
208 *
209 * 26 cycles:
210 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
211 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
212 */
213 latency = is_haswell ? 22 : 24;
214 break;
215
216 case SHADER_OPCODE_TEX:
217 case SHADER_OPCODE_TXD:
218 case SHADER_OPCODE_TXF:
219 case SHADER_OPCODE_TXL:
220 /* 18 cycles:
221 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
222 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
223 * send(8) g4<1>UW g114<8,8,1>F
224 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
225 *
226 * 697 +/-49 cycles (min 610, n=26):
227 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
228 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
229 * send(8) g4<1>UW g114<8,8,1>F
230 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
231 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
232 *
233 * So our first texture load of the batchbuffer takes ~700 cycles,
234 * since the caches are cold at that point.
235 *
236 * 840 +/- 92 cycles (min 720, n=25):
237 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
238 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
239 * send(8) g4<1>UW g114<8,8,1>F
240 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
241 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
242 * send(8) g4<1>UW g114<8,8,1>F
243 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
244 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
245 *
246 * On the second load, it takes just an extra ~140 cycles, and after
247 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
248 *
249 * 683 +/- 49 cycles (min = 602, n=47):
250 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
251 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
252 * send(8) g4<1>UW g114<8,8,1>F
253 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
254 * send(8) g50<1>UW g114<8,8,1>F
255 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
256 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
257 *
258 * The unit appears to be pipelined, since this matches up with the
259 * cache-cold case, despite there being two loads here. If you replace
260 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
261 *
262 * So, take some number between the cache-hot 140 cycles and the
263 * cache-cold 700 cycles. No particular tuning was done on this.
264 *
265 * I haven't done significant testing of the non-TEX opcodes. TXL at
266 * least looked about the same as TEX.
267 */
268 latency = 200;
269 break;
270
271 case SHADER_OPCODE_TXS:
272 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
273 * cycles (n=15):
274 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
275 * send(8) g6<1>UW g114<8,8,1>F
276 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
277 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
278 *
279 *
280 * Two loads took 535 +/- 30 cycles (n=19):
281 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
282 * send(16) g6<1>UW g114<8,8,1>F
283 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
284 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
285 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
286 * send(16) g8<1>UW g114<8,8,1>F
287 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
288 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
289 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
290 *
291 * Since the only cache that should matter is the instruction/state
292 * cache containing the surface state, assume that we always have hot
293 * caches.
294 */
295 latency = 100;
296 break;
297
298 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
299 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
300 /* Testing with varying-index pull constants:
301 *
302 * 16 cycles:
303 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
304 * send(8) g4<1>F g4<8,8,1>D
305 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
306 *
307 * ~480 cycles:
308 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
309 * send(8) g4<1>F g4<8,8,1>D
310 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
311 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
312 *
313 * ~620 cycles:
314 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
315 * send(8) g4<1>F g4<8,8,1>D
316 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
317 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
318 * send(8) g4<1>F g4<8,8,1>D
319 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
320 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
321 *
322 * So, if it's cache-hot, it's about 140. If it's cache cold, it's
323 * about 460. We expect to mostly be cache hot, so pick something more
324 * in that direction.
325 */
326 latency = 200;
327 break;
328
329 default:
330 /* 2 cycles:
331 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
332 *
333 * 16 cycles:
334 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
335 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
336 */
337 latency = 14;
338 break;
339 }
340 }
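/* The Gen7 numbers above follow a common measurement pattern, visible in
 * each case's comment: the first listing shows that back-to-back issue of
 * the instruction costs ~2 cycles, and the second adds a dependent MOV so
 * the pair's total exposes the producer's result latency. E.g. in the
 * default case, 16 cycles for mul + dependent mov, minus the ~2-cycle
 * issue, leaves roughly 14 cycles before the result is usable, which is
 * the latency stored here.
 */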
341
342 class instruction_scheduler {
343 public:
344 instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
345 {
346 this->bv = v;
347 this->mem_ctx = ralloc_context(v->mem_ctx);
348 this->grf_count = grf_count;
349 this->instructions.make_empty();
350 this->instructions_to_schedule = 0;
351 this->post_reg_alloc = post_reg_alloc;
352 this->time = 0;
353 }
354
355 ~instruction_scheduler()
356 {
357 ralloc_free(this->mem_ctx);
358 }
359 void add_barrier_deps(schedule_node *n);
360 void add_dep(schedule_node *before, schedule_node *after, int latency);
361 void add_dep(schedule_node *before, schedule_node *after);
362
363 void run(exec_list *instructions);
364 void add_inst(backend_instruction *inst);
365 virtual void calculate_deps() = 0;
366 virtual schedule_node *choose_instruction_to_schedule() = 0;
367
368 /**
369 * Returns how many cycles it takes the instruction to issue.
370 *
371 * Instructions in Gen hardware are handled one SIMD4 vector at a time,
372 * with 1 cycle per vector dispatched. Thus 8-wide pixel shaders take 2
373 * cycles to dispatch and 16-wide (compressed) instructions take 4.
374 */
375 virtual int issue_time(backend_instruction *inst) = 0;
376
377 void schedule_instructions(backend_instruction *next_block_header);
378
379 void *mem_ctx;
380
381 bool post_reg_alloc;
382 int instructions_to_schedule;
383 int grf_count;
384 int time;
385 exec_list instructions;
386 backend_visitor *bv;
387 };
388
389 class fs_instruction_scheduler : public instruction_scheduler
390 {
391 public:
392 fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
393 void calculate_deps();
394 bool is_compressed(fs_inst *inst);
395 schedule_node *choose_instruction_to_schedule();
396 int issue_time(backend_instruction *inst);
397 fs_visitor *v;
398 };
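/* The instruction_scheduler/fs_instruction_scheduler split is what makes
 * this pass reusable by other backends: a new backend only needs to supply
 * the three virtual hooks. A hypothetical VS scheduler (not part of this
 * file; the class and visitor names are made up, assuming the visitor
 * derives from backend_visitor the way fs_visitor does) might look like:
 *
 *    class vec4_instruction_scheduler : public instruction_scheduler
 *    {
 *    public:
 *       vec4_instruction_scheduler(vec4_visitor *v, int grf_count)
 *          : instruction_scheduler(v, grf_count, true), v(v)
 *       {
 *       }
 *
 *       void calculate_deps();            // walk the vec4 register files
 *       schedule_node *choose_instruction_to_schedule();
 *
 *       int issue_time(backend_instruction *inst)
 *       {
 *          return 2;                      // vec4 instructions are never compressed
 *       }
 *
 *       vec4_visitor *v;
 *    };
 */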
399
400 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
401 int grf_count,
402 bool post_reg_alloc)
403 : instruction_scheduler(v, grf_count, post_reg_alloc),
404 v(v)
405 {
406 }
407
408 void
409 instruction_scheduler::add_inst(backend_instruction *inst)
410 {
411 schedule_node *n = new(mem_ctx) schedule_node(inst, bv->intel);
412
413 assert(!inst->is_head_sentinel());
414 assert(!inst->is_tail_sentinel());
415
416 this->instructions_to_schedule++;
417
418 inst->remove();
419 instructions.push_tail(n);
420 }
421
422 /**
423 * Add a dependency between two instruction nodes.
424 *
425 * The @after node will be scheduled after @before. We will try to
426 * schedule it @latency cycles after @before, but make no guarantees.
427 */
428 void
429 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
430 int latency)
431 {
432 if (!before || !after)
433 return;
434
435 assert(before != after);
436
437 for (int i = 0; i < before->child_count; i++) {
438 if (before->children[i] == after) {
439 before->child_latency[i] = MAX2(before->child_latency[i], latency);
440 return;
441 }
442 }
443
444 if (before->child_array_size <= before->child_count) {
445 if (before->child_array_size < 16)
446 before->child_array_size = 16;
447 else
448 before->child_array_size *= 2;
449
450 before->children = reralloc(mem_ctx, before->children,
451 schedule_node *,
452 before->child_array_size);
453 before->child_latency = reralloc(mem_ctx, before->child_latency,
454 int, before->child_array_size);
455 }
456
457 before->children[before->child_count] = after;
458 before->child_latency[before->child_count] = latency;
459 before->child_count++;
460 after->parent_count++;
461 }
462
463 void
464 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
465 {
466 if (!before)
467 return;
468
469 add_dep(before, after, before->latency);
470 }
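/* Both overloads are used by calculate_deps() below (the index names here
 * are illustrative). The three-argument form pins an explicit latency on
 * the edge, while the two-argument form charges the @before node's own
 * latency, which is what read-after-write and write-after-write edges
 * want:
 *
 *    add_dep(last_grf_write[reg], n);        // RAW/WAW: producer's latency
 *    add_dep(n, last_mrf_write[mrf], 2);     // WAR on an MRF: mostly ordering
 *    add_dep(prev, n, 0);                    // barrier edge
 */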
471
472 /**
473 * Sometimes we really want this node to execute after everything that
474 * came before it and before everything that follows it. This adds
475 * the dependencies to do so.
476 */
477 void
478 instruction_scheduler::add_barrier_deps(schedule_node *n)
479 {
480 schedule_node *prev = (schedule_node *)n->prev;
481 schedule_node *next = (schedule_node *)n->next;
482
483 if (prev) {
484 while (!prev->is_head_sentinel()) {
485 add_dep(prev, n, 0);
486 prev = (schedule_node *)prev->prev;
487 }
488 }
489
490 if (next) {
491 while (!next->is_tail_sentinel()) {
492 add_dep(n, next, 0);
493 next = (schedule_node *)next->next;
494 }
495 }
496 }
497
498 /* Instruction scheduling needs to be aware of when a compressed MRF
499 * write actually writes two MRFs.
500 */
501 bool
502 fs_instruction_scheduler::is_compressed(fs_inst *inst)
503 {
504 return (v->dispatch_width == 16 &&
505 !inst->force_uncompressed &&
506 !inst->force_sechalf);
507 }
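/* For example (register numbers are only illustrative): a compressed
 * 16-wide write to m3 also lands in m4, so both MRFs have to be tracked,
 * and with BRW_MRF_COMPR4 set the second half goes to m3+4 = m7 instead.
 * calculate_deps() below applies exactly this adjustment to its
 * last_mrf_write[] bookkeeping.
 */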
508
509 void
510 fs_instruction_scheduler::calculate_deps()
511 {
512 /* Pre-register-allocation, this tracks the last write per VGRF (so
513 * different reg_offsets within it can interfere when they shouldn't).
514 * After register allocation, reg_offsets are gone and we track individual
515 * GRF registers.
516 */
517 schedule_node *last_grf_write[grf_count];
518 schedule_node *last_mrf_write[BRW_MAX_MRF];
519 schedule_node *last_conditional_mod[2] = { NULL, NULL };
520 /* Fixed HW registers are assumed to be separate from the virtual
521 * GRFs, so they can be tracked separately. We don't really write
522 * to fixed GRFs much, so don't bother tracking them on a more
523 * granular level.
524 */
525 schedule_node *last_fixed_grf_write = NULL;
526 int reg_width = v->dispatch_width / 8;
527
528 /* The last instruction always needs to still be the last
529 * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
530 * WHILE) and scheduling other things after it would disturb the
531 * basic block, or it's FB_WRITE and we should do a better job at
532 * dead code elimination anyway.
533 */
534 schedule_node *last = (schedule_node *)instructions.get_tail();
535 add_barrier_deps(last);
536
537 memset(last_grf_write, 0, sizeof(last_grf_write));
538 memset(last_mrf_write, 0, sizeof(last_mrf_write));
539
540 /* top-to-bottom dependencies: RAW and WAW. */
541 foreach_list(node, &instructions) {
542 schedule_node *n = (schedule_node *)node;
543 fs_inst *inst = (fs_inst *)n->inst;
544
545 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
546 add_barrier_deps(n);
547
548 /* read-after-write deps. */
549 for (int i = 0; i < 3; i++) {
550 if (inst->src[i].file == GRF) {
551 if (post_reg_alloc) {
552 for (int r = 0; r < reg_width; r++)
553 add_dep(last_grf_write[inst->src[i].reg + r], n);
554 } else {
555 add_dep(last_grf_write[inst->src[i].reg], n);
556 }
557 } else if (inst->src[i].file == HW_REG &&
558 (inst->src[i].fixed_hw_reg.file ==
559 BRW_GENERAL_REGISTER_FILE)) {
560 if (post_reg_alloc) {
561 for (int r = 0; r < reg_width; r++)
562 add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
563 } else {
564 add_dep(last_fixed_grf_write, n);
565 }
566 } else if (inst->src[i].file != BAD_FILE &&
567 inst->src[i].file != IMM &&
568 inst->src[i].file != UNIFORM) {
569 assert(inst->src[i].file != MRF);
570 add_barrier_deps(n);
571 }
572 }
573
574 for (int i = 0; i < inst->mlen; i++) {
575 /* It looks like the MRF regs are released in the send
576 * instruction once it's sent, not when the result comes
577 * back.
578 */
579 add_dep(last_mrf_write[inst->base_mrf + i], n);
580 }
581
582 if (inst->predicate) {
583 add_dep(last_conditional_mod[inst->flag_subreg], n);
584 }
585
586 /* write-after-write deps. */
587 if (inst->dst.file == GRF) {
588 if (post_reg_alloc) {
589 for (int r = 0; r < inst->regs_written * reg_width; r++) {
590 add_dep(last_grf_write[inst->dst.reg + r], n);
591 last_grf_write[inst->dst.reg + r] = n;
592 }
593 } else {
594 add_dep(last_grf_write[inst->dst.reg], n);
595 last_grf_write[inst->dst.reg] = n;
596 }
597 } else if (inst->dst.file == MRF) {
598 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
599
600 add_dep(last_mrf_write[reg], n);
601 last_mrf_write[reg] = n;
602 if (is_compressed(inst)) {
603 if (inst->dst.reg & BRW_MRF_COMPR4)
604 reg += 4;
605 else
606 reg++;
607 add_dep(last_mrf_write[reg], n);
608 last_mrf_write[reg] = n;
609 }
610 } else if (inst->dst.file == HW_REG &&
611 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
612 if (post_reg_alloc) {
613 for (int r = 0; r < reg_width; r++)
614 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
615 } else {
616 last_fixed_grf_write = n;
617 }
618 } else if (inst->dst.file != BAD_FILE) {
619 add_barrier_deps(n);
620 }
621
622 if (inst->mlen > 0) {
623 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
624 add_dep(last_mrf_write[inst->base_mrf + i], n);
625 last_mrf_write[inst->base_mrf + i] = n;
626 }
627 }
628
629 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
630 * conditional_mod, because it sets the flag register.
631 */
632 if (inst->conditional_mod ||
633 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
634 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
635 last_conditional_mod[inst->flag_subreg] = n;
636 }
637 }
638
639 /* bottom-to-top dependencies: WAR */
640 memset(last_grf_write, 0, sizeof(last_grf_write));
641 memset(last_mrf_write, 0, sizeof(last_mrf_write));
642 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
643 last_fixed_grf_write = NULL;
644
645 exec_node *node;
646 exec_node *prev;
647 for (node = instructions.get_tail(), prev = node->prev;
648 !node->is_head_sentinel();
649 node = prev, prev = node->prev) {
650 schedule_node *n = (schedule_node *)node;
651 fs_inst *inst = (fs_inst *)n->inst;
652
653 /* write-after-read deps. */
654 for (int i = 0; i < 3; i++) {
655 if (inst->src[i].file == GRF) {
656 if (post_reg_alloc) {
657 for (int r = 0; r < reg_width; r++)
658 add_dep(n, last_grf_write[inst->src[i].reg + r]);
659 } else {
660 add_dep(n, last_grf_write[inst->src[i].reg]);
661 }
662 } else if (inst->src[i].file == HW_REG &&
663 (inst->src[i].fixed_hw_reg.file ==
664 BRW_GENERAL_REGISTER_FILE)) {
665 if (post_reg_alloc) {
666 for (int r = 0; r < reg_width; r++)
667 add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
668 } else {
669 add_dep(n, last_fixed_grf_write);
670 }
671 } else if (inst->src[i].file != BAD_FILE &&
672 inst->src[i].file != IMM &&
673 inst->src[i].file != UNIFORM) {
674 assert(inst->src[i].file != MRF);
675 add_barrier_deps(n);
676 }
677 }
678
679 for (int i = 0; i < inst->mlen; i++) {
680 /* It looks like the MRF regs are released in the send
681 * instruction once it's sent, not when the result comes
682 * back.
683 */
684 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
685 }
686
687 if (inst->predicate) {
688 add_dep(n, last_conditional_mod[inst->flag_subreg]);
689 }
690
691 /* Update the things this instruction wrote, so earlier reads
692 * can mark this as a WAR dependency.
693 */
694 if (inst->dst.file == GRF) {
695 if (post_reg_alloc) {
696 for (int r = 0; r < inst->regs_written * reg_width; r++)
697 last_grf_write[inst->dst.reg + r] = n;
698 } else {
699 last_grf_write[inst->dst.reg] = n;
700 }
701 } else if (inst->dst.file == MRF) {
702 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
703
704 last_mrf_write[reg] = n;
705
706 if (is_compressed(inst)) {
707 if (inst->dst.reg & BRW_MRF_COMPR4)
708 reg += 4;
709 else
710 reg++;
711
712 last_mrf_write[reg] = n;
713 }
714 } else if (inst->dst.file == HW_REG &&
715 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
716 if (post_reg_alloc) {
717 for (int r = 0; r < reg_width; r++)
718 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
719 } else {
720 last_fixed_grf_write = n;
721 }
722 } else if (inst->dst.file != BAD_FILE) {
723 add_barrier_deps(n);
724 }
725
726 if (inst->mlen > 0) {
727 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
728 last_mrf_write[inst->base_mrf + i] = n;
729 }
730 }
731
732 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
733 * conditional_mod, because it sets the flag register.
734 */
735 if (inst->conditional_mod ||
736 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
737 last_conditional_mod[inst->flag_subreg] = n;
738 }
739 }
740 }
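/* A small worked example of the edges the two passes above produce
 * (virtual GRF numbers are made up):
 *
 *    add(8)  g10  g2   g3     (A)
 *    mul(8)  g11  g10  g4     (B)    RAW on g10:  add_dep(A, B)
 *    mov(8)  g10  g5          (C)    WAW on g10:  add_dep(A, C)
 *                                    WAR on g10:  add_dep(B, C)
 *
 * The RAW and WAW edges come from the top-to-bottom pass; the WAR edge
 * comes from the bottom-to-top pass, so B must issue before C overwrites
 * its source.
 */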
741
742 schedule_node *
743 fs_instruction_scheduler::choose_instruction_to_schedule()
744 {
745 schedule_node *chosen = NULL;
746
747 if (post_reg_alloc) {
748 int chosen_time = 0;
749
750 /* Of the instructions ready to execute, or the closest to being
751 * ready, choose the oldest one.
752 */
753 foreach_list(node, &instructions) {
754 schedule_node *n = (schedule_node *)node;
755
756 if (!chosen || n->unblocked_time < chosen_time) {
757 chosen = n;
758 chosen_time = n->unblocked_time;
759 }
760 }
761 } else {
762 /* Before register allocation, we don't care about the latencies of
763 * instructions. All we care about is reducing live intervals of
764 * variables so that we can avoid register spilling, or get 16-wide
765 * shaders which naturally do a better job of hiding instruction
766 * latency.
767 *
768 * To do so, schedule our instructions in a roughly LIFO/depth-first
769 * order: when new instructions become available as a result of
770 * scheduling something, choose those first so that our result
771 * hopefully is consumed quickly.
772 *
773 * The exception is messages that generate more than one result
774 * register (AKA texturing). In those cases the LIFO search would
775 * normally choose them back to back: scheduling one message unblocks
776 * both the users of its result and the MRF setup for the next one.
777 * So instead, prefer the newest instruction that writes at most one
778 * register, falling back to the oldest ready instruction otherwise.
779 */
780 for (schedule_node *node = (schedule_node *)instructions.get_tail();
781 node != instructions.get_head()->prev;
782 node = (schedule_node *)node->prev) {
783 schedule_node *n = (schedule_node *)node;
784 fs_inst *inst = (fs_inst *)n->inst;
785
786 chosen = n;
787 if (inst->regs_written <= 1)
788 break;
789 }
790 }
791
792 return chosen;
793 }
794
795 int
796 fs_instruction_scheduler::issue_time(backend_instruction *inst)
797 {
798 if (is_compressed((fs_inst *)inst))
799 return 4;
800 else
801 return 2;
802 }
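/* I.e. 8 channels / 4 channels per cycle = 2 cycles for an uncompressed
 * instruction and 16 / 4 = 4 for a compressed one, matching the dispatch
 * model documented on the issue_time() declaration above.
 */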
803
804 void
805 instruction_scheduler::schedule_instructions(backend_instruction *next_block_header)
806 {
807 time = 0;
808
809 /* Remove non-DAG heads from the list. */
810 foreach_list_safe(node, &instructions) {
811 schedule_node *n = (schedule_node *)node;
812 if (n->parent_count != 0)
813 n->remove();
814 }
815
816 while (!instructions.is_empty()) {
817 schedule_node *chosen = choose_instruction_to_schedule();
818
819 /* Schedule this instruction. */
820 assert(chosen);
821 chosen->remove();
822 next_block_header->insert_before(chosen->inst);
823 instructions_to_schedule--;
824
825 /* Update the clock for how soon an instruction could start after the
826 * chosen one.
827 */
828 time += issue_time(chosen->inst);
829
830 /* If we expected a delay for scheduling, then bump the clock to reflect
831 * that as well. In reality, the hardware will switch to another
832 * hyperthread and may not return to dispatching our thread for a while
833 * even after we're unblocked.
834 */
835 time = MAX2(time, chosen->unblocked_time);
836
837 if (debug) {
838 printf("clock %4d, scheduled: ", time);
839 bv->dump_instruction(chosen->inst);
840 }
841
842 /* Now that we've scheduled a new instruction, some of its
843 * children can be promoted to the list of instructions ready to
844 * be scheduled. Update the children's unblocked time for this
845 * DAG edge as we do so.
846 */
847 for (int i = 0; i < chosen->child_count; i++) {
848 schedule_node *child = chosen->children[i];
849
850 child->unblocked_time = MAX2(child->unblocked_time,
851 time + chosen->child_latency[i]);
852
853 child->parent_count--;
854 if (child->parent_count == 0) {
855 if (debug) {
856 printf("now available: ");
857 bv->dump_instruction(child->inst);
858 }
859 instructions.push_tail(child);
860 }
861 }
862
863 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
864 * but it's more limited pre-gen6, so if we send something off to it then
865 * the next math instruction isn't going to make progress until the first
866 * is done.
867 */
868 if (chosen->inst->is_math()) {
869 foreach_list(node, &instructions) {
870 schedule_node *n = (schedule_node *)node;
871
872 if (n->inst->is_math())
873 n->unblocked_time = MAX2(n->unblocked_time,
874 time + chosen->latency);
875 }
876 }
877 }
878
879 assert(instructions_to_schedule == 0);
880 }
881
882 void
883 instruction_scheduler::run(exec_list *all_instructions)
884 {
885 backend_instruction *next_block_header =
886 (backend_instruction *)all_instructions->head;
887
888 if (debug) {
889 printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
890 bv->dump_instructions();
891 }
892
893 while (!next_block_header->is_tail_sentinel()) {
894 /* Add things to be scheduled until we get to a new BB. */
895 while (!next_block_header->is_tail_sentinel()) {
896 backend_instruction *inst = next_block_header;
897 next_block_header = (backend_instruction *)next_block_header->next;
898
899 add_inst(inst);
900 if (inst->is_control_flow())
901 break;
902 }
903 calculate_deps();
904 schedule_instructions(next_block_header);
905 }
906
907 if (debug) {
908 printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
909 bv->dump_instructions();
910 }
911 }
912
913 void
914 fs_visitor::schedule_instructions(bool post_reg_alloc)
915 {
916 int grf_count;
917 if (post_reg_alloc)
918 grf_count = grf_used;
919 else
920 grf_count = virtual_grf_count;
921
922 fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
923 sched.run(&instructions);
924
925 if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
926 printf("fs%d estimated execution time: %d cycles\n",
927 dispatch_width, sched.time);
928 }
929
930 this->live_intervals_valid = false;
931 }
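/* A sketch of how a caller is expected to drive this pass; the real call
 * sites live elsewhere in the fs backend and may differ in detail, and
 * assign_regs() is just the allocator entry point assumed here:
 *
 *    schedule_instructions(false);   // pre-RA: shrink live intervals
 *    assign_regs();                  // register allocation
 *    schedule_instructions(true);    // post-RA: hide latency on real GRFs
 */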