mesa.git @ b24c38c351d304a9c6805c4b46d3043569393e9b
src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "brw_vec4.h"
30 #include "glsl/glsl_types.h"
31 #include "glsl/ir_optimization.h"
32
33 using namespace brw;
34
35 /** @file brw_schedule_instructions.cpp
36 *
37 * List scheduling of FS and vec4 instructions.
38 *
39 * The basic model of the list scheduler is to take a basic block,
40 * compute a DAG of the dependencies (RAW ordering with latency, WAW
41 * ordering with latency, WAR ordering), and make a list of the DAG heads.
42 * Heuristically pick a DAG head, then put all the children that are
43 * now DAG heads into the list of things to schedule.
44 *
45 * The heuristic is the important part. We're trying to be cheap,
46 * since actually computing an optimal schedule is NP-complete.
47 * What we do is track a "current clock". When we schedule a node, we
48 * update the earliest-unblocked clock time of its children, and
49 * increment the clock. Then, when trying to schedule, we just pick
50 * the earliest-unblocked instruction to schedule.
51 *
52 * Note that often there will be many things which could execute
53 * immediately, and there are a range of heuristic options to choose
54 * from in picking among those.
55 */
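
/* A simplified sketch of the pass as implemented below (not the literal
 * control flow):
 *
 *   for each scheduling region (run):
 *     wrap each instruction in a schedule_node          (add_inst)
 *     build the dependency DAG                          (calculate_deps)
 *     while unscheduled nodes remain:                   (schedule_instructions)
 *       pick a ready node                               (choose_instruction_to_schedule)
 *       emit it, advance the clock by its issue time,
 *       and push each child's unblocked_time out by the edge latency
 */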
56
57 static bool debug = false;
58
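/* Each instruction to be scheduled is wrapped in a schedule_node.  The
 * children/child_latency arrays are the outgoing DAG edges: children[i]
 * should not issue until roughly child_latency[i] cycles after this node.
 * latency is the estimated cycle count until this node's result is ready,
 * and unblocked_time is the earliest clock at which the node can issue
 * given what has already been scheduled.
 */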
59 class schedule_node : public exec_node
60 {
61 public:
62 schedule_node(backend_instruction *inst, const struct brw_context *brw)
63 {
64 this->inst = inst;
65 this->child_array_size = 0;
66 this->children = NULL;
67 this->child_latency = NULL;
68 this->child_count = 0;
69 this->parent_count = 0;
70 this->unblocked_time = 0;
71
72 /* We can't measure Gen6 timings directly, but we expect them to be
73 * much closer to Gen7 than to Gen4.
74 */
75 if (brw->gen >= 6)
76 set_latency_gen7(brw->is_haswell);
77 else
78 set_latency_gen4();
79 }
80
81 void set_latency_gen4();
82 void set_latency_gen7(bool is_haswell);
83
84 backend_instruction *inst;
85 schedule_node **children;
86 int *child_latency;
87 int child_count;
88 int parent_count;
89 int child_array_size;
90 int unblocked_time;
91 int latency;
92 };
93
94 void
95 schedule_node::set_latency_gen4()
96 {
97 int chans = 8;
98 int math_latency = 22;
99
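/* The cases below are expressed as multiples of chans * math_latency,
 * i.e. one math-unit pass over all 8 channels; SHADER_OPCODE_RCP, for
 * example, works out to 1 * 8 * 22 = 176 cycles.
 */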
100 switch (inst->opcode) {
101 case SHADER_OPCODE_RCP:
102 this->latency = 1 * chans * math_latency;
103 break;
104 case SHADER_OPCODE_RSQ:
105 this->latency = 2 * chans * math_latency;
106 break;
107 case SHADER_OPCODE_INT_QUOTIENT:
108 case SHADER_OPCODE_SQRT:
109 case SHADER_OPCODE_LOG2:
110 /* full precision log. partial is 2. */
111 this->latency = 3 * chans * math_latency;
112 break;
113 case SHADER_OPCODE_INT_REMAINDER:
114 case SHADER_OPCODE_EXP2:
115 /* full precision. partial is 3, same throughput. */
116 this->latency = 4 * chans * math_latency;
117 break;
118 case SHADER_OPCODE_POW:
119 this->latency = 8 * chans * math_latency;
120 break;
121 case SHADER_OPCODE_SIN:
122 case SHADER_OPCODE_COS:
123 /* minimum latency, max is 12 rounds. */
124 this->latency = 5 * chans * math_latency;
125 break;
126 default:
127 this->latency = 2;
128 break;
129 }
130 }
131
132 void
133 schedule_node::set_latency_gen7(bool is_haswell)
134 {
135 switch (inst->opcode) {
136 case BRW_OPCODE_MAD:
137 /* 2 cycles
138 * (since the last two src operands are in different register banks):
139 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
140 *
141 * 3 cycles on IVB, 4 on HSW
142 * (since the last two src operands are in the same register bank):
143 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
144 *
145 * 18 cycles on IVB, 16 on HSW
146 * (since the last two src operands are in different register banks):
147 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
148 * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
149 *
150 * 20 cycles on IVB, 18 on HSW
151 * (since the last two src operands are in the same register bank):
152 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
153 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
154 */
155
156 /* Our register allocator doesn't know about register banks, so use the
157 * higher latency.
158 */
159 latency = is_haswell ? 16 : 18;
160 break;
161
162 case BRW_OPCODE_LRP:
163 /* 2 cycles
164 * (since the last two src operands are in different register banks):
165 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
166 *
167 * 3 cycles on IVB, 4 on HSW
168 * (since the last two src operands are in the same register bank):
169 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
170 *
171 * 16 cycles on IVB, 14 on HSW
172 * (since the last two src operands are in different register banks):
173 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
174 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
175 *
176 * 16 cycles
177 * (since the last two src operands are in the same register bank):
178 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
179 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
180 */
181
182 /* Our register allocator doesn't know about register banks, so use the
183 * higher latency.
184 */
185 latency = 14;
186 break;
187
188 case SHADER_OPCODE_RCP:
189 case SHADER_OPCODE_RSQ:
190 case SHADER_OPCODE_SQRT:
191 case SHADER_OPCODE_LOG2:
192 case SHADER_OPCODE_EXP2:
193 case SHADER_OPCODE_SIN:
194 case SHADER_OPCODE_COS:
195 /* 2 cycles:
196 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
197 *
198 * 18 cycles:
199 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
200 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
201 *
202 * Same for exp2, log2, rsq, sqrt, sin, cos.
203 */
204 latency = is_haswell ? 14 : 16;
205 break;
206
207 case SHADER_OPCODE_POW:
208 /* 2 cycles:
209 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
210 *
211 * 26 cycles:
212 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
213 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
214 */
215 latency = is_haswell ? 22 : 24;
216 break;
217
218 case SHADER_OPCODE_TEX:
219 case SHADER_OPCODE_TXD:
220 case SHADER_OPCODE_TXF:
221 case SHADER_OPCODE_TXL:
222 /* 18 cycles:
223 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
224 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
225 * send(8) g4<1>UW g114<8,8,1>F
226 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
227 *
228 * 697 +/- 49 cycles (min 610, n=26):
229 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
230 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
231 * send(8) g4<1>UW g114<8,8,1>F
232 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
233 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
234 *
235 * So our first texture load of the batchbuffer takes ~700 cycles,
236 * since the caches are cold at that point.
237 *
238 * 840 +/- 92 cycles (min 720, n=25):
239 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
240 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
241 * send(8) g4<1>UW g114<8,8,1>F
242 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
243 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
244 * send(8) g4<1>UW g114<8,8,1>F
245 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
246 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
247 *
248 * On the second load, it takes just an extra ~140 cycles, and after
249 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
250 *
251 * 683 +/- 49 cycles (min = 602, n=47):
252 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
253 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
254 * send(8) g4<1>UW g114<8,8,1>F
255 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
256 * send(8) g50<1>UW g114<8,8,1>F
257 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
258 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
259 *
260 * The unit appears to be pipelined, since this matches up with the
261 * cache-cold case, despite there being two loads here. If you replace
262 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
263 *
264 * So, take some number between the cache-hot 140 cycles and the
265 * cache-cold 700 cycles. No particular tuning was done on this.
266 *
267 * I haven't done significant testing of the non-TEX opcodes. TXL at
268 * least looked about the same as TEX.
269 */
270 latency = 200;
271 break;
272
273 case SHADER_OPCODE_TXS:
274 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
275 * cycles (n=15):
276 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
277 * send(8) g6<1>UW g114<8,8,1>F
278 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
279 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
280 *
281 *
282 * Two loads was 535 +/- 30 cycles (n=19):
283 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
284 * send(16) g6<1>UW g114<8,8,1>F
285 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
286 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
287 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
288 * send(16) g8<1>UW g114<8,8,1>F
289 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
290 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
291 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
292 *
293 * Since the only cache that should matter is the instruction/state
294 * cache containing the surface state, assume that we always have hot
295 * caches.
296 */
297 latency = 100;
298 break;
299
300 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
301 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
302 case VS_OPCODE_PULL_CONSTANT_LOAD:
303 /* Testing with varying-index pull constants:
304 *
305 * 16 cycles:
306 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
307 * send(8) g4<1>F g4<8,8,1>D
308 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
309 *
310 * ~480 cycles:
311 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
312 * send(8) g4<1>F g4<8,8,1>D
313 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
314 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
315 *
316 * ~620 cycles:
317 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
318 * send(8) g4<1>F g4<8,8,1>D
319 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
320 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
321 * send(8) g4<1>F g4<8,8,1>D
322 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
323 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
324 *
325 * So, if it's cache-hot, it's about 140. If it's cache-cold, it's
326 * about 460. We expect to mostly be cache-hot, so pick something more
327 * in that direction.
328 */
329 latency = 200;
330 break;
331
332 default:
333 /* 2 cycles:
334 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
335 *
336 * 16 cycles:
337 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
338 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
339 */
340 latency = 14;
341 break;
342 }
343 }
344
345 class instruction_scheduler {
346 public:
347 instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
348 {
349 this->bv = v;
350 this->mem_ctx = ralloc_context(v->mem_ctx);
351 this->grf_count = grf_count;
352 this->instructions.make_empty();
353 this->instructions_to_schedule = 0;
354 this->post_reg_alloc = post_reg_alloc;
355 this->time = 0;
356 }
357
358 ~instruction_scheduler()
359 {
360 ralloc_free(this->mem_ctx);
361 }
362 void add_barrier_deps(schedule_node *n);
363 void add_dep(schedule_node *before, schedule_node *after, int latency);
364 void add_dep(schedule_node *before, schedule_node *after);
365
366 void run(exec_list *instructions);
367 void add_inst(backend_instruction *inst);
368 virtual void calculate_deps() = 0;
369 virtual schedule_node *choose_instruction_to_schedule() = 0;
370
371 /**
372 * Returns how many cycles it takes the instruction to issue.
373 *
374 * Instructions in Gen hardware are handled one SIMD4 vector at a time,
375 * with 1 cycle per vector dispatched. Thus 8-wide pixel shaders take 2
376 * cycles to dispatch and 16-wide (compressed) instructions take 4.
377 */
378 virtual int issue_time(backend_instruction *inst) = 0;
379
380 void schedule_instructions(backend_instruction *next_block_header);
381
382 void *mem_ctx;
383
384 bool post_reg_alloc;
385 int instructions_to_schedule;
386 int grf_count;
387 int time;
388 exec_list instructions;
389 backend_visitor *bv;
390 };
391
392 class fs_instruction_scheduler : public instruction_scheduler
393 {
394 public:
395 fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
396 void calculate_deps();
397 bool is_compressed(fs_inst *inst);
398 schedule_node *choose_instruction_to_schedule();
399 int issue_time(backend_instruction *inst);
400 fs_visitor *v;
401 };
402
403 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
404 int grf_count,
405 bool post_reg_alloc)
406 : instruction_scheduler(v, grf_count, post_reg_alloc),
407 v(v)
408 {
409 }
410
411 class vec4_instruction_scheduler : public instruction_scheduler
412 {
413 public:
414 vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
415 void calculate_deps();
416 schedule_node *choose_instruction_to_schedule();
417 int issue_time(backend_instruction *inst);
418 vec4_visitor *v;
419 };
420
421 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
422 int grf_count)
423 : instruction_scheduler(v, grf_count, true),
424 v(v)
425 {
426 }
427
428 void
429 instruction_scheduler::add_inst(backend_instruction *inst)
430 {
431 schedule_node *n = new(mem_ctx) schedule_node(inst, bv->brw);
432
433 assert(!inst->is_head_sentinel());
434 assert(!inst->is_tail_sentinel());
435
436 this->instructions_to_schedule++;
437
438 inst->remove();
439 instructions.push_tail(n);
440 }
441
442 /**
443 * Add a dependency between two instruction nodes.
444 *
445 * The @after node will be scheduled after @before. We will try to
446 * schedule it @latency cycles after @before, but no guarantees there.
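 *
 * Duplicate edges between the same pair of nodes are merged by keeping
 * the larger latency, and callers pass an explicit latency of 0 for pure
 * ordering edges (e.g. barrier deps and back-to-back flag writes).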
447 */
448 void
449 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
450 int latency)
451 {
452 if (!before || !after)
453 return;
454
455 assert(before != after);
456
457 for (int i = 0; i < before->child_count; i++) {
458 if (before->children[i] == after) {
459 before->child_latency[i] = MAX2(before->child_latency[i], latency);
460 return;
461 }
462 }
463
464 if (before->child_array_size <= before->child_count) {
465 if (before->child_array_size < 16)
466 before->child_array_size = 16;
467 else
468 before->child_array_size *= 2;
469
470 before->children = reralloc(mem_ctx, before->children,
471 schedule_node *,
472 before->child_array_size);
473 before->child_latency = reralloc(mem_ctx, before->child_latency,
474 int, before->child_array_size);
475 }
476
477 before->children[before->child_count] = after;
478 before->child_latency[before->child_count] = latency;
479 before->child_count++;
480 after->parent_count++;
481 }
482
483 void
484 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
485 {
486 if (!before)
487 return;
488
489 add_dep(before, after, before->latency);
490 }
491
492 /**
493 * Sometimes we really want this node to execute after everything that
494 * came before it and before everything that follows it. This adds
495 * the deps to do so.
496 */
497 void
498 instruction_scheduler::add_barrier_deps(schedule_node *n)
499 {
500 schedule_node *prev = (schedule_node *)n->prev;
501 schedule_node *next = (schedule_node *)n->next;
502
503 if (prev) {
504 while (!prev->is_head_sentinel()) {
505 add_dep(prev, n, 0);
506 prev = (schedule_node *)prev->prev;
507 }
508 }
509
510 if (next) {
511 while (!next->is_tail_sentinel()) {
512 add_dep(n, next, 0);
513 next = (schedule_node *)next->next;
514 }
515 }
516 }
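
/* In calculate_deps() below, barrier deps are used for the block-final
 * instruction, for FS_OPCODE_PLACEHOLDER_HALT, and for accesses to
 * register files the tracking there doesn't model, which pins those
 * instructions in place relative to the rest of the block.
 */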
517
518 /* Instruction scheduling needs to be aware of when an MRF write
519 * actually writes two MRFs (i.e. when the instruction is compressed).
520 */
521 bool
522 fs_instruction_scheduler::is_compressed(fs_inst *inst)
523 {
524 return (v->dispatch_width == 16 &&
525 !inst->force_uncompressed &&
526 !inst->force_sechalf);
527 }
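
/* For example, a compressed SIMD16 write to m2 also writes m3, and with
 * BRW_MRF_COMPR4 set it writes m2 and m6 instead; calculate_deps() below
 * records a write to both MRFs in those cases.
 */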
528
529 void
530 fs_instruction_scheduler::calculate_deps()
531 {
532 /* Pre-register-allocation, this tracks the last write per VGRF (so
533 * different reg_offsets within it can interfere when they shouldn't).
534 * After register allocation, reg_offsets are gone and we track individual
535 * GRF registers.
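 *
 * Post-reg-alloc, a SIMD16 (reg_width == 2) access covers
 * reg_width * regs_read/regs_written consecutive hardware GRFs, which is
 * why the loops below fan a single source or destination out across
 * several entries of last_grf_write[].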
536 */
537 schedule_node *last_grf_write[grf_count];
538 schedule_node *last_mrf_write[BRW_MAX_MRF];
539 schedule_node *last_conditional_mod[2] = { NULL, NULL };
540 /* Fixed HW registers are assumed to be separate from the virtual
541 * GRFs, so they can be tracked separately. We don't really write
542 * to fixed GRFs much, so don't bother tracking them on a more
543 * granular level.
544 */
545 schedule_node *last_fixed_grf_write = NULL;
546 int reg_width = v->dispatch_width / 8;
547
548 /* The last instruction always needs to still be the last
549 * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
550 * WHILE) and scheduling other things after it would disturb the
551 * basic block, or it's FB_WRITE and we should do a better job at
552 * dead code elimination anyway.
553 */
554 schedule_node *last = (schedule_node *)instructions.get_tail();
555 add_barrier_deps(last);
556
557 memset(last_grf_write, 0, sizeof(last_grf_write));
558 memset(last_mrf_write, 0, sizeof(last_mrf_write));
559
560 /* top-to-bottom dependencies: RAW and WAW. */
561 foreach_list(node, &instructions) {
562 schedule_node *n = (schedule_node *)node;
563 fs_inst *inst = (fs_inst *)n->inst;
564
565 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
566 add_barrier_deps(n);
567
568 /* read-after-write deps. */
569 for (int i = 0; i < 3; i++) {
570 if (inst->src[i].file == GRF) {
571 if (post_reg_alloc) {
572 for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
573 add_dep(last_grf_write[inst->src[i].reg + r], n);
574 } else {
575 add_dep(last_grf_write[inst->src[i].reg], n);
576 }
577 } else if (inst->src[i].file == HW_REG &&
578 (inst->src[i].fixed_hw_reg.file ==
579 BRW_GENERAL_REGISTER_FILE)) {
580 if (post_reg_alloc) {
581 int size = reg_width;
582 if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
583 size = 1;
584 for (int r = 0; r < size; r++)
585 add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
586 } else {
587 add_dep(last_fixed_grf_write, n);
588 }
589 } else if (inst->src[i].file != BAD_FILE &&
590 inst->src[i].file != IMM &&
591 inst->src[i].file != UNIFORM) {
592 assert(inst->src[i].file != MRF);
593 add_barrier_deps(n);
594 }
595 }
596
597 if (inst->base_mrf != -1) {
598 for (int i = 0; i < inst->mlen; i++) {
599 /* It looks like the MRF regs are released in the send
600 * instruction once it's sent, not when the result comes
601 * back.
602 */
603 add_dep(last_mrf_write[inst->base_mrf + i], n);
604 }
605 }
606
607 if (inst->predicate) {
608 add_dep(last_conditional_mod[inst->flag_subreg], n);
609 }
610
611 /* write-after-write deps. */
612 if (inst->dst.file == GRF) {
613 if (post_reg_alloc) {
614 for (int r = 0; r < inst->regs_written * reg_width; r++) {
615 add_dep(last_grf_write[inst->dst.reg + r], n);
616 last_grf_write[inst->dst.reg + r] = n;
617 }
618 } else {
619 add_dep(last_grf_write[inst->dst.reg], n);
620 last_grf_write[inst->dst.reg] = n;
621 }
622 } else if (inst->dst.file == MRF) {
623 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
624
625 add_dep(last_mrf_write[reg], n);
626 last_mrf_write[reg] = n;
627 if (is_compressed(inst)) {
628 if (inst->dst.reg & BRW_MRF_COMPR4)
629 reg += 4;
630 else
631 reg++;
632 add_dep(last_mrf_write[reg], n);
633 last_mrf_write[reg] = n;
634 }
635 } else if (inst->dst.file == HW_REG &&
636 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
637 if (post_reg_alloc) {
638 for (int r = 0; r < reg_width; r++)
639 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
640 } else {
641 last_fixed_grf_write = n;
642 }
643 } else if (inst->dst.file != BAD_FILE) {
644 add_barrier_deps(n);
645 }
646
647 if (inst->mlen > 0 && inst->base_mrf != -1) {
648 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
649 add_dep(last_mrf_write[inst->base_mrf + i], n);
650 last_mrf_write[inst->base_mrf + i] = n;
651 }
652 }
653
654 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
655 * conditional_mod, because it sets the flag register.
656 */
657 if (inst->conditional_mod ||
658 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
659 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
660 last_conditional_mod[inst->flag_subreg] = n;
661 }
662 }
663
664 /* bottom-to-top dependencies: WAR */
665 memset(last_grf_write, 0, sizeof(last_grf_write));
666 memset(last_mrf_write, 0, sizeof(last_mrf_write));
667 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
668 last_fixed_grf_write = NULL;
669
670 exec_node *node;
671 exec_node *prev;
672 for (node = instructions.get_tail(), prev = node->prev;
673 !node->is_head_sentinel();
674 node = prev, prev = node->prev) {
675 schedule_node *n = (schedule_node *)node;
676 fs_inst *inst = (fs_inst *)n->inst;
677
678 /* write-after-read deps. */
679 for (int i = 0; i < 3; i++) {
680 if (inst->src[i].file == GRF) {
681 if (post_reg_alloc) {
682 for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
683 add_dep(n, last_grf_write[inst->src[i].reg + r]);
684 } else {
685 add_dep(n, last_grf_write[inst->src[i].reg]);
686 }
687 } else if (inst->src[i].file == HW_REG &&
688 (inst->src[i].fixed_hw_reg.file ==
689 BRW_GENERAL_REGISTER_FILE)) {
690 if (post_reg_alloc) {
691 int size = reg_width;
692 if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
693 size = 1;
694 for (int r = 0; r < size; r++)
695 add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
696 } else {
697 add_dep(n, last_fixed_grf_write);
698 }
699 } else if (inst->src[i].file != BAD_FILE &&
700 inst->src[i].file != IMM &&
701 inst->src[i].file != UNIFORM) {
702 assert(inst->src[i].file != MRF);
703 add_barrier_deps(n);
704 }
705 }
706
707 if (inst->base_mrf != -1) {
708 for (int i = 0; i < inst->mlen; i++) {
709 /* It looks like the MRF regs are released in the send
710 * instruction once it's sent, not when the result comes
711 * back.
712 */
713 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
714 }
715 }
716
717 if (inst->predicate) {
718 add_dep(n, last_conditional_mod[inst->flag_subreg]);
719 }
720
721 /* Update the things this instruction wrote, so earlier reads
722 * can mark this as a WAR dependency.
723 */
724 if (inst->dst.file == GRF) {
725 if (post_reg_alloc) {
726 for (int r = 0; r < inst->regs_written * reg_width; r++)
727 last_grf_write[inst->dst.reg + r] = n;
728 } else {
729 last_grf_write[inst->dst.reg] = n;
730 }
731 } else if (inst->dst.file == MRF) {
732 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
733
734 last_mrf_write[reg] = n;
735
736 if (is_compressed(inst)) {
737 if (inst->dst.reg & BRW_MRF_COMPR4)
738 reg += 4;
739 else
740 reg++;
741
742 last_mrf_write[reg] = n;
743 }
744 } else if (inst->dst.file == HW_REG &&
745 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
746 if (post_reg_alloc) {
747 for (int r = 0; r < reg_width; r++)
748 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
749 } else {
750 last_fixed_grf_write = n;
751 }
752 } else if (inst->dst.file != BAD_FILE) {
753 add_barrier_deps(n);
754 }
755
756 if (inst->mlen > 0 && inst->base_mrf != -1) {
757 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
758 last_mrf_write[inst->base_mrf + i] = n;
759 }
760 }
761
762 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
763 * conditional_mod, because it sets the flag register.
764 */
765 if (inst->conditional_mod ||
766 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
767 last_conditional_mod[inst->flag_subreg] = n;
768 }
769 }
770 }
771
772 void
773 vec4_instruction_scheduler::calculate_deps()
774 {
775 schedule_node *last_grf_write[grf_count];
776 schedule_node *last_mrf_write[BRW_MAX_MRF];
777 schedule_node *last_conditional_mod = NULL;
778 /* Fixed HW registers are assumed to be separate from the virtual
779 * GRFs, so they can be tracked separately. We don't really write
780 * to fixed GRFs much, so don't bother tracking them on a more
781 * granular level.
782 */
783 schedule_node *last_fixed_grf_write = NULL;
784
785 /* The last instruction always needs to still be the last instruction.
786 * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
787 * other things after it would disturb the basic block, or it's the EOT
788 * URB_WRITE and we should do a better job at dead code eliminating
789 * anything that could have been scheduled after it.
790 */
791 schedule_node *last = (schedule_node *)instructions.get_tail();
792 add_barrier_deps(last);
793
794 memset(last_grf_write, 0, sizeof(last_grf_write));
795 memset(last_mrf_write, 0, sizeof(last_mrf_write));
796
797 /* top-to-bottom dependencies: RAW and WAW. */
798 foreach_list(node, &instructions) {
799 schedule_node *n = (schedule_node *)node;
800 vec4_instruction *inst = (vec4_instruction *)n->inst;
801
802 /* read-after-write deps. */
803 for (int i = 0; i < 3; i++) {
804 if (inst->src[i].file == GRF) {
805 add_dep(last_grf_write[inst->src[i].reg], n);
806 } else if (inst->src[i].file == HW_REG &&
807 (inst->src[i].fixed_hw_reg.file ==
808 BRW_GENERAL_REGISTER_FILE)) {
809 add_dep(last_fixed_grf_write, n);
810 } else if (inst->src[i].file != BAD_FILE &&
811 inst->src[i].file != IMM &&
812 inst->src[i].file != UNIFORM) {
813 /* No reads from MRF, and ATTR is already translated away */
814 assert(inst->src[i].file != MRF &&
815 inst->src[i].file != ATTR);
816 add_barrier_deps(n);
817 }
818 }
819
820 for (int i = 0; i < inst->mlen; i++) {
821 /* It looks like the MRF regs are released in the send
822 * instruction once it's sent, not when the result comes
823 * back.
824 */
825 add_dep(last_mrf_write[inst->base_mrf + i], n);
826 }
827
828 if (inst->depends_on_flags()) {
829 assert(last_conditional_mod);
830 add_dep(last_conditional_mod, n);
831 }
832
833 /* write-after-write deps. */
834 if (inst->dst.file == GRF) {
835 add_dep(last_grf_write[inst->dst.reg], n);
836 last_grf_write[inst->dst.reg] = n;
837 } else if (inst->dst.file == MRF) {
838 add_dep(last_mrf_write[inst->dst.reg], n);
839 last_mrf_write[inst->dst.reg] = n;
840 } else if (inst->dst.file == HW_REG &&
841 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
842 last_fixed_grf_write = n;
843 } else if (inst->dst.file != BAD_FILE) {
844 add_barrier_deps(n);
845 }
846
847 if (inst->mlen > 0) {
848 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
849 add_dep(last_mrf_write[inst->base_mrf + i], n);
850 last_mrf_write[inst->base_mrf + i] = n;
851 }
852 }
853
854 if (inst->conditional_mod) {
855 add_dep(last_conditional_mod, n, 0);
856 last_conditional_mod = n;
857 }
858 }
859
860 /* bottom-to-top dependencies: WAR */
861 memset(last_grf_write, 0, sizeof(last_grf_write));
862 memset(last_mrf_write, 0, sizeof(last_mrf_write));
863 last_conditional_mod = NULL;
864 last_fixed_grf_write = NULL;
865
866 exec_node *node;
867 exec_node *prev;
868 for (node = instructions.get_tail(), prev = node->prev;
869 !node->is_head_sentinel();
870 node = prev, prev = node->prev) {
871 schedule_node *n = (schedule_node *)node;
872 vec4_instruction *inst = (vec4_instruction *)n->inst;
873
874 /* write-after-read deps. */
875 for (int i = 0; i < 3; i++) {
876 if (inst->src[i].file == GRF) {
877 add_dep(n, last_grf_write[inst->src[i].reg]);
878 } else if (inst->src[i].file == HW_REG &&
879 (inst->src[i].fixed_hw_reg.file ==
880 BRW_GENERAL_REGISTER_FILE)) {
881 add_dep(n, last_fixed_grf_write);
882 } else if (inst->src[i].file != BAD_FILE &&
883 inst->src[i].file != IMM &&
884 inst->src[i].file != UNIFORM) {
885 assert(inst->src[i].file != MRF &&
886 inst->src[i].file != ATTR);
887 add_barrier_deps(n);
888 }
889 }
890
891 for (int i = 0; i < inst->mlen; i++) {
892 /* It looks like the MRF regs are released in the send
893 * instruction once it's sent, not when the result comes
894 * back.
895 */
896 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
897 }
898
899 if (inst->depends_on_flags()) {
900 add_dep(n, last_conditional_mod);
901 }
902
903 /* Update the things this instruction wrote, so earlier reads
904 * can mark this as a WAR dependency.
905 */
906 if (inst->dst.file == GRF) {
907 last_grf_write[inst->dst.reg] = n;
908 } else if (inst->dst.file == MRF) {
909 last_mrf_write[inst->dst.reg] = n;
910 } else if (inst->dst.file == HW_REG &&
911 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
912 last_fixed_grf_write = n;
913 } else if (inst->dst.file != BAD_FILE) {
914 add_barrier_deps(n);
915 }
916
917 if (inst->mlen > 0) {
918 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
919 last_mrf_write[inst->base_mrf + i] = n;
920 }
921 }
922
923 if (inst->conditional_mod) {
924 last_conditional_mod = n;
925 }
926 }
927 }
928
929 schedule_node *
930 fs_instruction_scheduler::choose_instruction_to_schedule()
931 {
932 schedule_node *chosen = NULL;
933
934 if (post_reg_alloc) {
935 int chosen_time = 0;
936
937 /* Of the instructions ready to execute or the closest to being
938 * ready, choose the oldest one.
939 */
940 foreach_list(node, &instructions) {
941 schedule_node *n = (schedule_node *)node;
942
943 if (!chosen || n->unblocked_time < chosen_time) {
944 chosen = n;
945 chosen_time = n->unblocked_time;
946 }
947 }
948 } else {
949 /* Before register allocation, we don't care about the latencies of
950 * instructions. All we care about is reducing live intervals of
951 * variables so that we can avoid register spilling, or get 16-wide
952 * shaders which naturally do a better job of hiding instruction
953 * latency.
954 *
955 * To do so, schedule our instructions in a roughly LIFO/depth-first
956 * order: when new instructions become available as a result of
957 * scheduling something, choose those first so that our result
958 * hopefully is consumed quickly.
959 *
960 * The exception is messages that generate more than one result
961 * register (AKA texturing). In those cases, the LIFO search would
962 * normally tend to choose them quickly (because scheduling the
963 * previous message not only unblocked the children using its result,
964 * but also the MRF setup for the next sampler message, which in turn
965 * unblocks the next sampler message).
966 */
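/* So: walk from the most recently added node back toward the head and
 * take the first candidate that writes at most one register; if every
 * candidate writes more than one, we fall through to the oldest one.
 */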
967 for (schedule_node *node = (schedule_node *)instructions.get_tail();
968 node != instructions.get_head()->prev;
969 node = (schedule_node *)node->prev) {
970 schedule_node *n = (schedule_node *)node;
971 fs_inst *inst = (fs_inst *)n->inst;
972
973 chosen = n;
974 if (inst->regs_written <= 1)
975 break;
976 }
977 }
978
979 return chosen;
980 }
981
982 schedule_node *
983 vec4_instruction_scheduler::choose_instruction_to_schedule()
984 {
985 schedule_node *chosen = NULL;
986 int chosen_time = 0;
987
988 /* Of the instructions ready to execute or the closest to being ready,
989 * choose the oldest one.
990 */
991 foreach_list(node, &instructions) {
992 schedule_node *n = (schedule_node *)node;
993
994 if (!chosen || n->unblocked_time < chosen_time) {
995 chosen = n;
996 chosen_time = n->unblocked_time;
997 }
998 }
999
1000 return chosen;
1001 }
1002
1003 int
1004 fs_instruction_scheduler::issue_time(backend_instruction *inst)
1005 {
1006 if (is_compressed((fs_inst *)inst))
1007 return 4;
1008 else
1009 return 2;
1010 }
1011
1012 int
1013 vec4_instruction_scheduler::issue_time(backend_instruction *inst)
1014 {
1015 /* We always execute as two vec4s in parallel. */
1016 return 2;
1017 }
1018
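/* Greedy list-scheduling loop: repeatedly pick a ready node, emit its
 * instruction ahead of next_block_header, advance the clock by its issue
 * time (or to its unblocked_time, if that is later), and then release any
 * children whose last remaining parent this was.
 */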
1019 void
1020 instruction_scheduler::schedule_instructions(backend_instruction *next_block_header)
1021 {
1022 time = 0;
1023
1024 /* Remove non-DAG heads from the list. */
1025 foreach_list_safe(node, &instructions) {
1026 schedule_node *n = (schedule_node *)node;
1027 if (n->parent_count != 0)
1028 n->remove();
1029 }
1030
1031 while (!instructions.is_empty()) {
1032 schedule_node *chosen = choose_instruction_to_schedule();
1033
1034 /* Schedule this instruction. */
1035 assert(chosen);
1036 chosen->remove();
1037 next_block_header->insert_before(chosen->inst);
1038 instructions_to_schedule--;
1039
1040 /* Update the clock for how soon an instruction could start after the
1041 * chosen one.
1042 */
1043 time += issue_time(chosen->inst);
1044
1045 /* If we expected a delay for scheduling, then bump the clock to reflect
1046 * that as well. In reality, the hardware will switch to another
1047 * hyperthread and may not return to dispatching our thread for a while
1048 * even after we're unblocked.
1049 */
1050 time = MAX2(time, chosen->unblocked_time);
1051
1052 if (debug) {
1053 printf("clock %4d, scheduled: ", time);
1054 bv->dump_instruction(chosen->inst);
1055 }
1056
1057 /* Now that we've scheduled a new instruction, some of its
1058 * children can be promoted to the list of instructions ready to
1059 * be scheduled. Update the children's unblocked time for this
1060 * DAG edge as we do so.
1061 */
1062 for (int i = 0; i < chosen->child_count; i++) {
1063 schedule_node *child = chosen->children[i];
1064
1065 child->unblocked_time = MAX2(child->unblocked_time,
1066 time + chosen->child_latency[i]);
1067
1068 child->parent_count--;
1069 if (child->parent_count == 0) {
1070 if (debug) {
1071 printf("now available: ");
1072 bv->dump_instruction(child->inst);
1073 }
1074 instructions.push_tail(child);
1075 }
1076 }
1077
1078 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
1079 * but it's more limited pre-Gen6, so if we send something off to it then
1080 * the next math instruction isn't going to make progress until the first
1081 * is done.
1082 */
1083 if (chosen->inst->is_math()) {
1084 foreach_list(node, &instructions) {
1085 schedule_node *n = (schedule_node *)node;
1086
1087 if (n->inst->is_math())
1088 n->unblocked_time = MAX2(n->unblocked_time,
1089 time + chosen->latency);
1090 }
1091 }
1092 }
1093
1094 assert(instructions_to_schedule == 0);
1095 }
1096
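/* Scheduling regions are delimited by control flow: the loop below
 * accumulates instructions up to and including the next control-flow
 * instruction (or the end of the shader), schedules that group, and then
 * continues with next_block_header pointing at the start of the next
 * region.
 */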
1097 void
1098 instruction_scheduler::run(exec_list *all_instructions)
1099 {
1100 backend_instruction *next_block_header =
1101 (backend_instruction *)all_instructions->head;
1102
1103 if (debug) {
1104 printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
1105 bv->dump_instructions();
1106 }
1107
1108 while (!next_block_header->is_tail_sentinel()) {
1109 /* Add things to be scheduled until we get to a new BB. */
1110 while (!next_block_header->is_tail_sentinel()) {
1111 backend_instruction *inst = next_block_header;
1112 next_block_header = (backend_instruction *)next_block_header->next;
1113
1114 add_inst(inst);
1115 if (inst->is_control_flow())
1116 break;
1117 }
1118 calculate_deps();
1119 schedule_instructions(next_block_header);
1120 }
1121
1122 if (debug) {
1123 printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
1124 bv->dump_instructions();
1125 }
1126 }
1127
1128 void
1129 fs_visitor::schedule_instructions(bool post_reg_alloc)
1130 {
1131 int grf_count;
1132 if (post_reg_alloc)
1133 grf_count = grf_used;
1134 else
1135 grf_count = virtual_grf_count;
1136
1137 fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
1138 sched.run(&instructions);
1139
1140 if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
1141 printf("fs%d estimated execution time: %d cycles\n",
1142 dispatch_width, sched.time);
1143 }
1144
1145 invalidate_live_intervals();
1146 }
1147
1148 void
1149 vec4_visitor::opt_schedule_instructions()
1150 {
1151 vec4_instruction_scheduler sched(this, prog_data->total_grf);
1152 sched.run(&instructions);
1153
1154 if (unlikely(debug_flag)) {
1155 printf("vec4 estimated execution time: %d cycles\n", sched.time);
1156 }
1157
1158 this->live_intervals_valid = false;
1159 }