src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp [mesa.git]
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "brw_vec4.h"
30 #include "glsl/glsl_types.h"
31 #include "glsl/ir_optimization.h"
32
33 using namespace brw;
34
35 /** @file brw_schedule_instructions.cpp
36 *
37 * List scheduling of FS and vec4 instructions.
38 *
39 * The basic model of the list scheduler is to take a basic block,
40 * compute a DAG of the dependencies (RAW ordering with latency, WAW
41 * ordering with latency, WAR ordering), and make a list of the DAG heads.
42 * Heuristically pick a DAG head, then put all the children that are
43 * now DAG heads into the list of things to schedule.
44 *
45 * The heuristic is the important part. We're trying to be cheap,
46 * since actually computing an optimal schedule is NP-complete.
47 * What we do is track a "current clock". When we schedule a node, we
48 * update the earliest-unblocked clock time of its children, and
49 * increment the clock. Then, when trying to schedule, we just pick
50 * the earliest-unblocked instruction to schedule.
51 *
52 * Note that often there will be many things which could execute
53 * immediately, and there are a range of heuristic options to choose
54 * from in picking among those.
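*
* Overall flow: run() walks the instruction stream one basic block at a
* time, calculate_deps() builds the dependency DAG for that block, and
* schedule_instructions() re-emits the block in the chosen order.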
55 */
56
57 static bool debug = false;
58
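/* A node in the scheduling DAG for one basic block.  children[i] are the
 * instructions that must wait for this one, and child_latency[i] is how many
 * cycles after this node issues that child may start.  parent_count is the
 * number of unscheduled instructions this node still waits on (it becomes a
 * DAG head when this reaches zero), unblocked_time is the earliest clock
 * value at which it may issue given its already-scheduled parents, and
 * latency is the estimated cycle count before a dependent instruction can
 * use this one's result.
 */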
59 class schedule_node : public exec_node
60 {
61 public:
62 schedule_node(backend_instruction *inst, const struct intel_context *intel)
63 {
64 this->inst = inst;
65 this->child_array_size = 0;
66 this->children = NULL;
67 this->child_latency = NULL;
68 this->child_count = 0;
69 this->parent_count = 0;
70 this->unblocked_time = 0;
71
72 /* We can't measure Gen6 timings directly, but we expect them to be much
73 * closer to Gen7's than to Gen4's.
74 */
75 if (intel->gen >= 6)
76 set_latency_gen7(intel->is_haswell);
77 else
78 set_latency_gen4();
79 }
80
81 void set_latency_gen4();
82 void set_latency_gen7(bool is_haswell);
83
84 backend_instruction *inst;
85 schedule_node **children;
86 int *child_latency;
87 int child_count;
88 int parent_count;
89 int child_array_size;
90 int unblocked_time;
91 int latency;
92 };
93
94 void
95 schedule_node::set_latency_gen4()
96 {
97 int chans = 8;
98 int math_latency = 22;
99
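/* The multipliers below are the number of rounds the math unit spends on
 * each operation; latency is modeled as rounds * channels * an estimated
 * per-round cost (math_latency).
 */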
100 switch (inst->opcode) {
101 case SHADER_OPCODE_RCP:
102 this->latency = 1 * chans * math_latency;
103 break;
104 case SHADER_OPCODE_RSQ:
105 this->latency = 2 * chans * math_latency;
106 break;
107 case SHADER_OPCODE_INT_QUOTIENT:
108 case SHADER_OPCODE_SQRT:
109 case SHADER_OPCODE_LOG2:
110 /* Full-precision log; the partial-precision version is 2 rounds. */
111 this->latency = 3 * chans * math_latency;
112 break;
113 case SHADER_OPCODE_INT_REMAINDER:
114 case SHADER_OPCODE_EXP2:
115 /* Full precision; the partial-precision version is 3 rounds, at the same throughput. */
116 this->latency = 4 * chans * math_latency;
117 break;
118 case SHADER_OPCODE_POW:
119 this->latency = 8 * chans * math_latency;
120 break;
121 case SHADER_OPCODE_SIN:
122 case SHADER_OPCODE_COS:
123 /* Minimum latency; the worst case is 12 rounds. */
124 this->latency = 5 * chans * math_latency;
125 break;
126 default:
127 this->latency = 2;
128 break;
129 }
130 }
131
132 void
133 schedule_node::set_latency_gen7(bool is_haswell)
134 {
135 switch (inst->opcode) {
136 case BRW_OPCODE_MAD:
137 /* 2 cycles
138 * (since the last two src operands are in different register banks):
139 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
140 *
141 * 3 cycles on IVB, 4 on HSW
142 * (since the last two src operands are in the same register bank):
143 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
144 *
145 * 18 cycles on IVB, 16 on HSW
146 * (since the last two src operands are in different register banks):
147 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
148 * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
149 *
150 * 20 cycles on IVB, 18 on HSW
151 * (since the last two src operands are in the same register bank):
152 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
153 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
154 */
155
156 /* Our register allocator doesn't know about register banks, so use the
157 * higher latency.
158 */
159 latency = is_haswell ? 16 : 18;
160 break;
161
162 case BRW_OPCODE_LRP:
163 /* 2 cycles
164 * (since the last two src operands are in different register banks):
165 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
166 *
167 * 3 cycles on IVB, 4 on HSW
168 * (since the last two src operands are in the same register bank):
169 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
170 *
171 * 16 cycles on IVB, 14 on HSW
172 * (since the last two src operands are in different register banks):
173 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
174 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
175 *
176 * 16 cycles
177 * (since the last two src operands are in the same register bank):
178 * lrp(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
179 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
180 */
181
182 /* Our register allocator doesn't know about register banks, so use the
183 * higher latency.
184 */
185 latency = 14;
186 break;
187
188 case SHADER_OPCODE_RCP:
189 case SHADER_OPCODE_RSQ:
190 case SHADER_OPCODE_SQRT:
191 case SHADER_OPCODE_LOG2:
192 case SHADER_OPCODE_EXP2:
193 case SHADER_OPCODE_SIN:
194 case SHADER_OPCODE_COS:
195 /* 2 cycles:
196 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
197 *
198 * 18 cycles:
199 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
200 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
201 *
202 * Same for exp2, log2, rsq, sqrt, sin, cos.
203 */
204 latency = is_haswell ? 14 : 16;
205 break;
206
207 case SHADER_OPCODE_POW:
208 /* 2 cycles:
209 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
210 *
211 * 26 cycles:
212 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
213 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
214 */
215 latency = is_haswell ? 22 : 24;
216 break;
217
218 case SHADER_OPCODE_TEX:
219 case SHADER_OPCODE_TXD:
220 case SHADER_OPCODE_TXF:
221 case SHADER_OPCODE_TXL:
222 /* 18 cycles:
223 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
224 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
225 * send(8) g4<1>UW g114<8,8,1>F
226 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
227 *
228 * 697 +/-49 cycles (min 610, n=26):
229 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
230 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
231 * send(8) g4<1>UW g114<8,8,1>F
232 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
233 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
234 *
235 * So our first texture load of the batchbuffer takes ~700 cycles,
236 * since the caches are cold at that point.
237 *
238 * 840 +/- 92 cycles (min 720, n=25):
239 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
240 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
241 * send(8) g4<1>UW g114<8,8,1>F
242 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
243 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
244 * send(8) g4<1>UW g114<8,8,1>F
245 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
246 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
247 *
248 * On the second load, it takes just an extra ~140 cycles, and after
249 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
250 *
251 * 683 +/- 49 cycles (min = 602, n=47):
252 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
253 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
254 * send(8) g4<1>UW g114<8,8,1>F
255 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
256 * send(8) g50<1>UW g114<8,8,1>F
257 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
258 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
259 *
260 * The unit appears to be pipelined, since this matches up with the
261 * cache-cold case, despite there being two loads here. If you replace
262 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
263 *
264 * So, take some number between the cache-hot 140 cycles and the
265 * cache-cold 700 cycles. No particular tuning was done on this.
266 *
267 * I haven't done significant testing of the non-TEX opcodes. TXL at
268 * least looked about the same as TEX.
269 */
270 latency = 200;
271 break;
272
273 case SHADER_OPCODE_TXS:
274 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
275 * cycles (n=15):
276 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
277 * send(8) g6<1>UW g114<8,8,1>F
278 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
279 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
280 *
281 *
282 * Two loads was 535 +/- 30 cycles (n=19):
283 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
284 * send(16) g6<1>UW g114<8,8,1>F
285 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
286 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
287 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
288 * send(16) g8<1>UW g114<8,8,1>F
289 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
290 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
291 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
292 *
293 * Since the only cache that should matter here is the instruction/state
294 * cache containing the surface state, assume that the caches are
295 * always hot.
296 */
297 latency = 100;
298 break;
299
300 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
301 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
302 case VS_OPCODE_PULL_CONSTANT_LOAD:
303 /* Testing varying-index pull constants:
304 *
305 * 16 cycles:
306 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
307 * send(8) g4<1>F g4<8,8,1>D
308 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
309 *
310 * ~480 cycles:
311 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
312 * send(8) g4<1>F g4<8,8,1>D
313 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
314 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
315 *
316 * ~620 cycles:
317 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
318 * send(8) g4<1>F g4<8,8,1>D
319 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
320 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
321 * send(8) g4<1>F g4<8,8,1>D
322 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
323 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
324 *
325 * So a cache-hot load costs about 140 cycles and a cache-cold one
326 * about 460. We expect to be cache-hot most of the time, so pick a
327 * value closer to the hot case.
328 */
329 latency = 200;
330 break;
331
332 default:
333 /* 2 cycles:
334 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
335 *
336 * 16 cycles:
337 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
338 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
339 */
340 latency = 14;
341 break;
342 }
343 }
344
345 class instruction_scheduler {
346 public:
347 instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
348 {
349 this->bv = v;
350 this->mem_ctx = ralloc_context(v->mem_ctx);
351 this->grf_count = grf_count;
352 this->instructions.make_empty();
353 this->instructions_to_schedule = 0;
354 this->post_reg_alloc = post_reg_alloc;
355 this->time = 0;
356 }
357
358 ~instruction_scheduler()
359 {
360 ralloc_free(this->mem_ctx);
361 }
362 void add_barrier_deps(schedule_node *n);
363 void add_dep(schedule_node *before, schedule_node *after, int latency);
364 void add_dep(schedule_node *before, schedule_node *after);
365
366 void run(exec_list *instructions);
367 void add_inst(backend_instruction *inst);
368 virtual void calculate_deps() = 0;
369 virtual schedule_node *choose_instruction_to_schedule() = 0;
370
371 /**
372 * Returns how many cycles it takes the instruction to issue.
373 *
374 * Gen hardware handles instructions one SIMD4 vector at a time, with 1
375 * cycle per vector dispatched. Thus 8-wide pixel-shader instructions
376 * take 2 cycles to issue and 16-wide (compressed) instructions take 4.
377 */
378 virtual int issue_time(backend_instruction *inst) = 0;
379
380 void schedule_instructions(backend_instruction *next_block_header);
381
382 void *mem_ctx;
383
384 bool post_reg_alloc;
385 int instructions_to_schedule;
386 int grf_count;
387 int time;
388 exec_list instructions;
389 backend_visitor *bv;
390 };
391
392 class fs_instruction_scheduler : public instruction_scheduler
393 {
394 public:
395 fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
396 void calculate_deps();
397 bool is_compressed(fs_inst *inst);
398 schedule_node *choose_instruction_to_schedule();
399 int issue_time(backend_instruction *inst);
400 fs_visitor *v;
401 };
402
403 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
404 int grf_count,
405 bool post_reg_alloc)
406 : instruction_scheduler(v, grf_count, post_reg_alloc),
407 v(v)
408 {
409 }
410
411 class vec4_instruction_scheduler : public instruction_scheduler
412 {
413 public:
414 vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
415 void calculate_deps();
416 schedule_node *choose_instruction_to_schedule();
417 int issue_time(backend_instruction *inst);
418 vec4_visitor *v;
419 };
420
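/* The vec4 scheduler is only run after register allocation, so
 * post_reg_alloc is hard-coded to true.
 */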
421 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
422 int grf_count)
423 : instruction_scheduler(v, grf_count, true),
424 v(v)
425 {
426 }
427
428 void
429 instruction_scheduler::add_inst(backend_instruction *inst)
430 {
431 schedule_node *n = new(mem_ctx) schedule_node(inst, bv->intel);
432
433 assert(!inst->is_head_sentinel());
434 assert(!inst->is_tail_sentinel());
435
436 this->instructions_to_schedule++;
437
438 inst->remove();
439 instructions.push_tail(n);
440 }
441
442 /**
443 * Add a dependency between two instruction nodes.
444 *
445 * The @after node will be scheduled after @before. We will try to
446 * schedule it @latency cycles after @before, but no guarantees there.
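*
* Passing NULL for either node is a no-op, so callers can hand in
* possibly-unset "last writer" pointers directly.  Adding an edge that
* already exists just raises that edge's latency to the larger value.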
447 */
448 void
449 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
450 int latency)
451 {
452 if (!before || !after)
453 return;
454
455 assert(before != after);
456
457 for (int i = 0; i < before->child_count; i++) {
458 if (before->children[i] == after) {
459 before->child_latency[i] = MAX2(before->child_latency[i], latency);
460 return;
461 }
462 }
463
464 if (before->child_array_size <= before->child_count) {
465 if (before->child_array_size < 16)
466 before->child_array_size = 16;
467 else
468 before->child_array_size *= 2;
469
470 before->children = reralloc(mem_ctx, before->children,
471 schedule_node *,
472 before->child_array_size);
473 before->child_latency = reralloc(mem_ctx, before->child_latency,
474 int, before->child_array_size);
475 }
476
477 before->children[before->child_count] = after;
478 before->child_latency[before->child_count] = latency;
479 before->child_count++;
480 after->parent_count++;
481 }
482
483 void
484 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
485 {
486 if (!before)
487 return;
488
489 add_dep(before, after, before->latency);
490 }
491
492 /**
493 * Sometimes we really want this node to execute after everything that
494 * came before it and before everything that follows it. This adds
495 * the deps to do so.
496 */
497 void
498 instruction_scheduler::add_barrier_deps(schedule_node *n)
499 {
500 schedule_node *prev = (schedule_node *)n->prev;
501 schedule_node *next = (schedule_node *)n->next;
502
503 if (prev) {
504 while (!prev->is_head_sentinel()) {
505 add_dep(prev, n, 0);
506 prev = (schedule_node *)prev->prev;
507 }
508 }
509
510 if (next) {
511 while (!next->is_tail_sentinel()) {
512 add_dep(n, next, 0);
513 next = (schedule_node *)next->next;
514 }
515 }
516 }
517
518 /* Instruction scheduling needs to be aware of when an MRF write is
519 * compressed and therefore actually writes two MRF registers.
520 */
521 bool
522 fs_instruction_scheduler::is_compressed(fs_inst *inst)
523 {
524 return (v->dispatch_width == 16 &&
525 !inst->force_uncompressed &&
526 !inst->force_sechalf);
527 }
528
529 void
530 fs_instruction_scheduler::calculate_deps()
531 {
532 /* Pre-register-allocation, this tracks the last write per VGRF (so
533 * different reg_offsets within it can interfere when they shouldn't).
534 * After register allocation, reg_offsets are gone and we track individual
535 * GRF registers.
536 */
537 schedule_node *last_grf_write[grf_count];
538 schedule_node *last_mrf_write[BRW_MAX_MRF];
539 schedule_node *last_conditional_mod[2] = { NULL, NULL };
540 /* Fixed HW registers are assumed to be separate from the virtual
541 * GRFs, so they can be tracked separately. We don't really write
542 * to fixed GRFs much, so don't bother tracking them on a more
543 * granular level.
544 */
545 schedule_node *last_fixed_grf_write = NULL;
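/* Number of hardware registers a full-width value occupies: 1 in 8-wide
 * dispatch, 2 in 16-wide (compressed) dispatch.
 */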
546 int reg_width = v->dispatch_width / 8;
547
548 /* The last instruction always needs to still be the last
549 * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
550 * WHILE) and scheduling other things after it would disturb the
551 * basic block, or it's FB_WRITE and anything that could have been
552 * scheduled after it should have been dead-code eliminated anyway.
553 */
554 schedule_node *last = (schedule_node *)instructions.get_tail();
555 add_barrier_deps(last);
556
557 memset(last_grf_write, 0, sizeof(last_grf_write));
558 memset(last_mrf_write, 0, sizeof(last_mrf_write));
559
560 /* top-to-bottom dependencies: RAW and WAW. */
561 foreach_list(node, &instructions) {
562 schedule_node *n = (schedule_node *)node;
563 fs_inst *inst = (fs_inst *)n->inst;
564
565 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
566 add_barrier_deps(n);
567
568 /* read-after-write deps. */
569 for (int i = 0; i < 3; i++) {
570 if (inst->src[i].file == GRF) {
571 if (post_reg_alloc) {
572 for (int r = 0; r < reg_width; r++)
573 add_dep(last_grf_write[inst->src[i].reg + r], n);
574 } else {
575 add_dep(last_grf_write[inst->src[i].reg], n);
576 }
577 } else if (inst->src[i].file == HW_REG &&
578 (inst->src[i].fixed_hw_reg.file ==
579 BRW_GENERAL_REGISTER_FILE)) {
580 if (post_reg_alloc) {
581 int size = reg_width;
582 if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
583 size = 1;
584 for (int r = 0; r < size; r++)
585 add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
586 } else {
587 add_dep(last_fixed_grf_write, n);
588 }
589 } else if (inst->src[i].file != BAD_FILE &&
590 inst->src[i].file != IMM &&
591 inst->src[i].file != UNIFORM) {
592 assert(inst->src[i].file != MRF);
593 add_barrier_deps(n);
594 }
595 }
596
597 for (int i = 0; i < inst->mlen; i++) {
598 /* It looks like the MRF regs are released in the send
599 * instruction once it's sent, not when the result comes
600 * back.
601 */
602 add_dep(last_mrf_write[inst->base_mrf + i], n);
603 }
604
605 if (inst->predicate) {
606 add_dep(last_conditional_mod[inst->flag_subreg], n);
607 }
608
609 /* write-after-write deps. */
610 if (inst->dst.file == GRF) {
611 if (post_reg_alloc) {
612 for (int r = 0; r < inst->regs_written * reg_width; r++) {
613 add_dep(last_grf_write[inst->dst.reg + r], n);
614 last_grf_write[inst->dst.reg + r] = n;
615 }
616 } else {
617 add_dep(last_grf_write[inst->dst.reg], n);
618 last_grf_write[inst->dst.reg] = n;
619 }
620 } else if (inst->dst.file == MRF) {
621 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
622
623 add_dep(last_mrf_write[reg], n);
624 last_mrf_write[reg] = n;
625 if (is_compressed(inst)) {
626 if (inst->dst.reg & BRW_MRF_COMPR4)
627 reg += 4;
628 else
629 reg++;
630 add_dep(last_mrf_write[reg], n);
631 last_mrf_write[reg] = n;
632 }
633 } else if (inst->dst.file == HW_REG &&
634 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
635 if (post_reg_alloc) {
636 for (int r = 0; r < reg_width; r++)
637 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
638 } else {
639 last_fixed_grf_write = n;
640 }
641 } else if (inst->dst.file != BAD_FILE) {
642 add_barrier_deps(n);
643 }
644
645 if (inst->mlen > 0) {
646 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
647 add_dep(last_mrf_write[inst->base_mrf + i], n);
648 last_mrf_write[inst->base_mrf + i] = n;
649 }
650 }
651
652 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
653 * conditional_mod, because it sets the flag register.
654 */
655 if (inst->conditional_mod ||
656 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
657 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
658 last_conditional_mod[inst->flag_subreg] = n;
659 }
660 }
661
662 /* bottom-to-top dependencies: WAR */
663 memset(last_grf_write, 0, sizeof(last_grf_write));
664 memset(last_mrf_write, 0, sizeof(last_mrf_write));
665 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
666 last_fixed_grf_write = NULL;
667
668 exec_node *node;
669 exec_node *prev;
670 for (node = instructions.get_tail(), prev = node->prev;
671 !node->is_head_sentinel();
672 node = prev, prev = node->prev) {
673 schedule_node *n = (schedule_node *)node;
674 fs_inst *inst = (fs_inst *)n->inst;
675
676 /* write-after-read deps. */
677 for (int i = 0; i < 3; i++) {
678 if (inst->src[i].file == GRF) {
679 if (post_reg_alloc) {
680 for (int r = 0; r < reg_width; r++)
681 add_dep(n, last_grf_write[inst->src[i].reg + r]);
682 } else {
683 add_dep(n, last_grf_write[inst->src[i].reg]);
684 }
685 } else if (inst->src[i].file == HW_REG &&
686 (inst->src[i].fixed_hw_reg.file ==
687 BRW_GENERAL_REGISTER_FILE)) {
688 if (post_reg_alloc) {
689 int size = reg_width;
690 if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
691 size = 1;
692 for (int r = 0; r < size; r++)
693 add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
694 } else {
695 add_dep(n, last_fixed_grf_write);
696 }
697 } else if (inst->src[i].file != BAD_FILE &&
698 inst->src[i].file != IMM &&
699 inst->src[i].file != UNIFORM) {
700 assert(inst->src[i].file != MRF);
701 add_barrier_deps(n);
702 }
703 }
704
705 for (int i = 0; i < inst->mlen; i++) {
706 /* It looks like the MRF regs are released in the send
707 * instruction once it's sent, not when the result comes
708 * back.
709 */
710 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
711 }
712
713 if (inst->predicate) {
714 add_dep(n, last_conditional_mod[inst->flag_subreg]);
715 }
716
717 /* Update the things this instruction wrote, so earlier reads
718 * can mark this as a WAR dependency.
719 */
720 if (inst->dst.file == GRF) {
721 if (post_reg_alloc) {
722 for (int r = 0; r < inst->regs_written * reg_width; r++)
723 last_grf_write[inst->dst.reg + r] = n;
724 } else {
725 last_grf_write[inst->dst.reg] = n;
726 }
727 } else if (inst->dst.file == MRF) {
728 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
729
730 last_mrf_write[reg] = n;
731
732 if (is_compressed(inst)) {
733 if (inst->dst.reg & BRW_MRF_COMPR4)
734 reg += 4;
735 else
736 reg++;
737
738 last_mrf_write[reg] = n;
739 }
740 } else if (inst->dst.file == HW_REG &&
741 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
742 if (post_reg_alloc) {
743 for (int r = 0; r < reg_width; r++)
744 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
745 } else {
746 last_fixed_grf_write = n;
747 }
748 } else if (inst->dst.file != BAD_FILE) {
749 add_barrier_deps(n);
750 }
751
752 if (inst->mlen > 0) {
753 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
754 last_mrf_write[inst->base_mrf + i] = n;
755 }
756 }
757
758 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
759 * conditional_mod, because it sets the flag register.
760 */
761 if (inst->conditional_mod ||
762 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
763 last_conditional_mod[inst->flag_subreg] = n;
764 }
765 }
766 }
767
768 void
769 vec4_instruction_scheduler::calculate_deps()
770 {
771 schedule_node *last_grf_write[grf_count];
772 schedule_node *last_mrf_write[BRW_MAX_MRF];
773 schedule_node *last_conditional_mod = NULL;
774 /* Fixed HW registers are assumed to be separate from the virtual
775 * GRFs, so they can be tracked separately. We don't really write
776 * to fixed GRFs much, so don't bother tracking them on a more
777 * granular level.
778 */
779 schedule_node *last_fixed_grf_write = NULL;
780
781 /* The last instruction always needs to still be the last instruction.
782 * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
783 * other things after it would disturb the basic block, or it's the EOT
784 * URB_WRITE and we should do a better job at dead code eliminating
785 * anything that could have been scheduled after it.
786 */
787 schedule_node *last = (schedule_node *)instructions.get_tail();
788 add_barrier_deps(last);
789
790 memset(last_grf_write, 0, sizeof(last_grf_write));
791 memset(last_mrf_write, 0, sizeof(last_mrf_write));
792
793 /* top-to-bottom dependencies: RAW and WAW. */
794 foreach_list(node, &instructions) {
795 schedule_node *n = (schedule_node *)node;
796 vec4_instruction *inst = (vec4_instruction *)n->inst;
797
798 /* read-after-write deps. */
799 for (int i = 0; i < 3; i++) {
800 if (inst->src[i].file == GRF) {
801 add_dep(last_grf_write[inst->src[i].reg], n);
802 } else if (inst->src[i].file == HW_REG &&
803 (inst->src[i].fixed_hw_reg.file ==
804 BRW_GENERAL_REGISTER_FILE)) {
805 add_dep(last_fixed_grf_write, n);
806 } else if (inst->src[i].file != BAD_FILE &&
807 inst->src[i].file != IMM &&
808 inst->src[i].file != UNIFORM) {
809 /* No reads from MRF, and ATTR is already translated away */
810 assert(inst->src[i].file != MRF &&
811 inst->src[i].file != ATTR);
812 add_barrier_deps(n);
813 }
814 }
815
816 for (int i = 0; i < inst->mlen; i++) {
817 /* It looks like the MRF regs are released in the send
818 * instruction once it's sent, not when the result comes
819 * back.
820 */
821 add_dep(last_mrf_write[inst->base_mrf + i], n);
822 }
823
824 if (inst->predicate) {
825 assert(last_conditional_mod);
826 add_dep(last_conditional_mod, n);
827 }
828
829 /* write-after-write deps. */
830 if (inst->dst.file == GRF) {
831 add_dep(last_grf_write[inst->dst.reg], n);
832 last_grf_write[inst->dst.reg] = n;
833 } else if (inst->dst.file == MRF) {
834 add_dep(last_mrf_write[inst->dst.reg], n);
835 last_mrf_write[inst->dst.reg] = n;
836 } else if (inst->dst.file == HW_REG &&
837 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
838 last_fixed_grf_write = n;
839 } else if (inst->dst.file != BAD_FILE) {
840 add_barrier_deps(n);
841 }
842
843 if (inst->mlen > 0) {
844 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
845 add_dep(last_mrf_write[inst->base_mrf + i], n);
846 last_mrf_write[inst->base_mrf + i] = n;
847 }
848 }
849
850 if (inst->conditional_mod) {
851 add_dep(last_conditional_mod, n, 0);
852 last_conditional_mod = n;
853 }
854 }
855
856 /* bottom-to-top dependencies: WAR */
857 memset(last_grf_write, 0, sizeof(last_grf_write));
858 memset(last_mrf_write, 0, sizeof(last_mrf_write));
859 last_conditional_mod = NULL;
860 last_fixed_grf_write = NULL;
861
862 exec_node *node;
863 exec_node *prev;
864 for (node = instructions.get_tail(), prev = node->prev;
865 !node->is_head_sentinel();
866 node = prev, prev = node->prev) {
867 schedule_node *n = (schedule_node *)node;
868 vec4_instruction *inst = (vec4_instruction *)n->inst;
869
870 /* write-after-read deps. */
871 for (int i = 0; i < 3; i++) {
872 if (inst->src[i].file == GRF) {
873 add_dep(n, last_grf_write[inst->src[i].reg]);
874 } else if (inst->src[i].file == HW_REG &&
875 (inst->src[i].fixed_hw_reg.file ==
876 BRW_GENERAL_REGISTER_FILE)) {
877 add_dep(n, last_fixed_grf_write);
878 } else if (inst->src[i].file != BAD_FILE &&
879 inst->src[i].file != IMM &&
880 inst->src[i].file != UNIFORM) {
881 assert(inst->src[i].file != MRF &&
882 inst->src[i].file != ATTR);
883 add_barrier_deps(n);
884 }
885 }
886
887 for (int i = 0; i < inst->mlen; i++) {
888 /* It looks like the MRF regs are released in the send
889 * instruction once it's sent, not when the result comes
890 * back.
891 */
892 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
893 }
894
895 if (inst->predicate) {
896 add_dep(n, last_conditional_mod);
897 }
898
899 /* Update the things this instruction wrote, so earlier reads
900 * can mark this as a WAR dependency.
901 */
902 if (inst->dst.file == GRF) {
903 last_grf_write[inst->dst.reg] = n;
904 } else if (inst->dst.file == MRF) {
905 last_mrf_write[inst->dst.reg] = n;
906 } else if (inst->dst.file == HW_REG &&
907 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
908 last_fixed_grf_write = n;
909 } else if (inst->dst.file != BAD_FILE) {
910 add_barrier_deps(n);
911 }
912
913 if (inst->mlen > 0) {
914 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
915 last_mrf_write[inst->base_mrf + i] = n;
916 }
917 }
918
919 if (inst->conditional_mod) {
920 last_conditional_mod = n;
921 }
922 }
923 }
924
925 schedule_node *
926 fs_instruction_scheduler::choose_instruction_to_schedule()
927 {
928 schedule_node *chosen = NULL;
929
930 if (post_reg_alloc) {
931 int chosen_time = 0;
932
933 /* Of the instructions ready to execute or the closest to being
934 * ready, choose the oldest one.
935 */
936 foreach_list(node, &instructions) {
937 schedule_node *n = (schedule_node *)node;
938
939 if (!chosen || n->unblocked_time < chosen_time) {
940 chosen = n;
941 chosen_time = n->unblocked_time;
942 }
943 }
944 } else {
945 /* Before register allocation, we don't care about the latencies of
946 * instructions. All we care about is reducing live intervals of
947 * variables so that we can avoid register spilling, or get 16-wide
948 * shaders which naturally do a better job of hiding instruction
949 * latency.
950 *
951 * To do so, schedule our instructions in a roughly LIFO/depth-first
952 * order: when new instructions become available as a result of
953 * scheduling something, choose those first so that our result
954 * hopefully is consumed quickly.
955 *
956 * The exception is messages that generate more than one result
957 * register (AKA texturing). In those cases, the LIFO search would
958 * normally tend to choose them quickly (because scheduling the
959 * previous message not only unblocked the children using its result,
960 * but also the MRF setup for the next sampler message, which in turn
961 * unblocks the next sampler message).
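*
* So below we keep walking from the tail and settle on the first
* instruction whose result fits in a single register, falling back to
* the oldest node only if every candidate writes multiple registers.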
962 */
963 for (schedule_node *node = (schedule_node *)instructions.get_tail();
964 node != instructions.get_head()->prev;
965 node = (schedule_node *)node->prev) {
966 schedule_node *n = (schedule_node *)node;
967 fs_inst *inst = (fs_inst *)n->inst;
968
969 chosen = n;
970 if (inst->regs_written <= 1)
971 break;
972 }
973 }
974
975 return chosen;
976 }
977
978 schedule_node *
979 vec4_instruction_scheduler::choose_instruction_to_schedule()
980 {
981 schedule_node *chosen = NULL;
982 int chosen_time = 0;
983
984 /* Of the instructions ready to execute or the closest to being ready,
985 * choose the oldest one.
986 */
987 foreach_list(node, &instructions) {
988 schedule_node *n = (schedule_node *)node;
989
990 if (!chosen || n->unblocked_time < chosen_time) {
991 chosen = n;
992 chosen_time = n->unblocked_time;
993 }
994 }
995
996 return chosen;
997 }
998
999 int
1000 fs_instruction_scheduler::issue_time(backend_instruction *inst)
1001 {
1002 if (is_compressed((fs_inst *)inst))
1003 return 4;
1004 else
1005 return 2;
1006 }
1007
1008 int
1009 vec4_instruction_scheduler::issue_time(backend_instruction *inst)
1010 {
1011 /* We always execute as two vec4s in parallel. */
1012 return 2;
1013 }
1014
1015 void
1016 instruction_scheduler::schedule_instructions(backend_instruction *next_block_header)
1017 {
1018 time = 0;
1019
1020 /* Remove non-DAG heads from the list. */
1021 foreach_list_safe(node, &instructions) {
1022 schedule_node *n = (schedule_node *)node;
1023 if (n->parent_count != 0)
1024 n->remove();
1025 }
1026
1027 while (!instructions.is_empty()) {
1028 schedule_node *chosen = choose_instruction_to_schedule();
1029
1030 /* Schedule this instruction. */
1031 assert(chosen);
1032 chosen->remove();
1033 next_block_header->insert_before(chosen->inst);
1034 instructions_to_schedule--;
1035
1036 /* Update the clock for how soon an instruction could start after the
1037 * chosen one.
1038 */
1039 time += issue_time(chosen->inst);
1040
1041 /* If we expected a delay for scheduling, then bump the clock to reflect
1042 * that as well. In reality, the hardware will switch to another
1043 * hyperthread and may not return to dispatching our thread for a while
1044 * even after we're unblocked.
1045 */
1046 time = MAX2(time, chosen->unblocked_time);
1047
1048 if (debug) {
1049 printf("clock %4d, scheduled: ", time);
1050 bv->dump_instruction(chosen->inst);
1051 }
1052
1053 /* Now that we've scheduled a new instruction, some of its
1054 * children can be promoted to the list of instructions ready to
1055 * be scheduled. Update the children's unblocked time for this
1056 * DAG edge as we do so.
1057 */
1058 for (int i = 0; i < chosen->child_count; i++) {
1059 schedule_node *child = chosen->children[i];
1060
1061 child->unblocked_time = MAX2(child->unblocked_time,
1062 time + chosen->child_latency[i]);
1063
1064 child->parent_count--;
1065 if (child->parent_count == 0) {
1066 if (debug) {
1067 printf("now available: ");
1068 bv->dump_instruction(child->inst);
1069 }
1070 instructions.push_tail(child);
1071 }
1072 }
1073
1074 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
1075 * but it's more limited pre-Gen6, so if we send something off to it then
1076 * the next math instruction isn't going to make progress until the first
1077 * is done.
1078 */
1079 if (chosen->inst->is_math()) {
1080 foreach_list(node, &instructions) {
1081 schedule_node *n = (schedule_node *)node;
1082
1083 if (n->inst->is_math())
1084 n->unblocked_time = MAX2(n->unblocked_time,
1085 time + chosen->latency);
1086 }
1087 }
1088 }
1089
1090 assert(instructions_to_schedule == 0);
1091 }
1092
1093 void
1094 instruction_scheduler::run(exec_list *all_instructions)
1095 {
1096 backend_instruction *next_block_header =
1097 (backend_instruction *)all_instructions->head;
1098
1099 if (debug) {
1100 printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
1101 bv->dump_instructions();
1102 }
1103
1104 while (!next_block_header->is_tail_sentinel()) {
1105 /* Add things to be scheduled until we get to a new BB. */
1106 while (!next_block_header->is_tail_sentinel()) {
1107 backend_instruction *inst = next_block_header;
1108 next_block_header = (backend_instruction *)next_block_header->next;
1109
1110 add_inst(inst);
1111 if (inst->is_control_flow())
1112 break;
1113 }
1114 calculate_deps();
1115 schedule_instructions(next_block_header);
1116 }
1117
1118 if (debug) {
1119 printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
1120 bv->dump_instructions();
1121 }
1122 }
1123
1124 void
1125 fs_visitor::schedule_instructions(bool post_reg_alloc)
1126 {
1127 int grf_count;
1128 if (post_reg_alloc)
1129 grf_count = grf_used;
1130 else
1131 grf_count = virtual_grf_count;
1132
1133 fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
1134 sched.run(&instructions);
1135
1136 if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
1137 printf("fs%d estimated execution time: %d cycles\n",
1138 dispatch_width, sched.time);
1139 }
1140
1141 this->live_intervals_valid = false;
1142 }
1143
1144 void
1145 vec4_visitor::opt_schedule_instructions()
1146 {
1147 vec4_instruction_scheduler sched(this, prog_data->total_grf);
1148 sched.run(&instructions);
1149
1150 if (unlikely(debug_flag)) {
1151 printf("vec4 estimated execution time: %d cycles\n", sched.time);
1152 }
1153
1154 this->live_intervals_valid = false;
1155 }