src/mesa/drivers/dri/i965/brw_vec4.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_cfg.h"
  26 #include "brw_vs.h"
  27 #include "brw_dead_control_flow.h"
  28
  29 extern "C" {
  30 #include "main/macros.h"
  31 #include "main/shaderobj.h"
  32 #include "program/prog_print.h"
  33 #include "program/prog_parameter.h"
  34 }
  35
  36 #define MAX_INSTRUCTION (1 << 30)
  37
  38 using namespace brw;
  39
  40 namespace brw {
  41
  42 /**
  43  * Common helper for constructing swizzles.  When only a subset of
  44  * channels of a vec4 are used, we don't want to reference the other
  45  * channels, as that will tell optimization passes that those other
  46  * channels are used.
  47  */
  48 unsigned
  49 swizzle_for_size(int size)
  50 {
  51    static const unsigned size_swizzles[4] = {
  52       BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
  53       BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
  54       BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
  55       BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
  56    };
  57
  58    assert((size >= 1) && (size <= 4));
  59    return size_swizzles[size - 1];
  60 }
  61
  62 void
  63 src_reg::init()
  64 {
  65    memset(this, 0, sizeof(*this));
  66
  67    this->file = BAD_FILE;
  68 }
  69
  70 src_reg::src_reg(register_file file, int reg, const glsl_type *type)
  71 {
  72    init();
  73
  74    this->file = file;
  75    this->reg = reg;
  76    if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
  77       this->swizzle = swizzle_for_size(type->vector_elements);
  78    else
  79       this->swizzle = BRW_SWIZZLE_XYZW;
  80 }
  81
  82 /** Generic unset register constructor. */
  83 src_reg::src_reg()
  84 {
  85    init();
  86 }
  87
  88 src_reg::src_reg(float f)
  89 {
  90    init();
  91
  92    this->file = IMM;
  93    this->type = BRW_REGISTER_TYPE_F;
  94    this->imm.f = f;
  95 }
  96
  97 src_reg::src_reg(uint32_t u)
  98 {
  99    init();
 100
 101    this->file = IMM;
 102    this->type = BRW_REGISTER_TYPE_UD;
 103    this->imm.u = u;
 104 }
 105
 106 src_reg::src_reg(int32_t i)
 107 {
 108    init();
 109
 110    this->file = IMM;
 111    this->type = BRW_REGISTER_TYPE_D;
 112    this->imm.i = i;
 113 }
 114
 115 src_reg::src_reg(struct brw_reg reg)
 116 {
 117    init();
 118
 119    this->file = HW_REG;
 120    this->fixed_hw_reg = reg;
 121 }
 122
 123 src_reg::src_reg(dst_reg reg)
 124 {
 125    init();
 126
 127    this->file = reg.file;
 128    this->reg = reg.reg;
 129    this->reg_offset = reg.reg_offset;
 130    this->type = reg.type;
 131    this->reladdr = reg.reladdr;
 132    this->fixed_hw_reg = reg.fixed_hw_reg;
 133
 134    int swizzles[4];
 135    int next_chan = 0;
 136    int last = 0;
 137
 138    for (int i = 0; i < 4; i++) {
 139       if (!(reg.writemask & (1 << i)))
 140          continue;
 141
 142       swizzles[next_chan++] = last = i;
 143    }
 144
 145    for (; next_chan < 4; next_chan++) {
 146       swizzles[next_chan] = last;
 147    }
 148
 149    this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
 150                                 swizzles[2], swizzles[3]);
 151 }
 152
 153 void
 154 dst_reg::init()
 155 {
 156    memset(this, 0, sizeof(*this));
 157    this->file = BAD_FILE;
 158    this->writemask = WRITEMASK_XYZW;
 159 }
 160
 161 dst_reg::dst_reg()
 162 {
 163    init();
 164 }
 165
 166 dst_reg::dst_reg(register_file file, int reg)
 167 {
 168    init();
 169
 170    this->file = file;
 171    this->reg = reg;
 172 }
 173
 174 dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
 175                  int writemask)
 176 {
 177    init();
 178
 179    this->file = file;
 180    this->reg = reg;
 181    this->type = brw_type_for_base_type(type);
 182    this->writemask = writemask;
 183 }
 184
 185 dst_reg::dst_reg(struct brw_reg reg)
 186 {
 187    init();
 188
 189    this->file = HW_REG;
 190    this->fixed_hw_reg = reg;
 191 }
 192
 193 dst_reg::dst_reg(src_reg reg)
 194 {
 195    init();
 196
 197    this->file = reg.file;
 198    this->reg = reg.reg;
 199    this->reg_offset = reg.reg_offset;
 200    this->type = reg.type;
 201    /* How should we do writemasking when converting from a src_reg?  It seems
 202     * pretty obvious that for src.xxxx the caller wants to write to src.x, but
 203     * what about for src.wx?  Just special-case src.xxxx for now.
 204     */
 205    if (reg.swizzle == BRW_SWIZZLE_XXXX)
 206       this->writemask = WRITEMASK_X;
 207    else
 208       this->writemask = WRITEMASK_XYZW;
 209    this->reladdr = reg.reladdr;
 210    this->fixed_hw_reg = reg.fixed_hw_reg;
 211 }
 212
 213 bool
 214 vec4_instruction::is_send_from_grf()
 215 {
 216    switch (opcode) {
 217    case SHADER_OPCODE_SHADER_TIME_ADD:
 218    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
 219       return true;
 220    default:
 221       return false;
 222    }
 223 }
 224
 225 bool
 226 vec4_visitor::can_do_source_mods(vec4_instruction *inst)
 227 {
 228    if (brw->gen == 6 && inst->is_math())
 229       return false;
 230
 231    if (inst->is_send_from_grf())
 232       return false;
 233
 234    if (!inst->can_do_source_mods())
 235       return false;
 236
 237    return true;
 238 }
 239
 240 /**
 241  * Returns how many MRFs an opcode will write over.
 242  *
 243  * Note that this is not the 0 or 1 implied writes in an actual gen
 244  * instruction -- the generate_* functions generate additional MOVs
 245  * for setup.
 246  */
 247 int
 248 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
 249 {
 250    if (inst->mlen == 0)
 251       return 0;
 252
 253    switch (inst->opcode) {
 254    case SHADER_OPCODE_RCP:
 255    case SHADER_OPCODE_RSQ:
 256    case SHADER_OPCODE_SQRT:
 257    case SHADER_OPCODE_EXP2:
 258    case SHADER_OPCODE_LOG2:
 259    case SHADER_OPCODE_SIN:
 260    case SHADER_OPCODE_COS:
 261       return 1;
 262    case SHADER_OPCODE_INT_QUOTIENT:
 263    case SHADER_OPCODE_INT_REMAINDER:
 264    case SHADER_OPCODE_POW:
 265       return 2;
 266    case VS_OPCODE_URB_WRITE:
 267       return 1;
 268    case VS_OPCODE_PULL_CONSTANT_LOAD:
 269       return 2;
 270    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 271       return 2;
 272    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 273       return 3;
 274    case GS_OPCODE_URB_WRITE:
 275    case GS_OPCODE_THREAD_END:
 276       return 0;
 277    case SHADER_OPCODE_SHADER_TIME_ADD:
 278       return 0;
 279    case SHADER_OPCODE_TEX:
 280    case SHADER_OPCODE_TXL:
 281    case SHADER_OPCODE_TXD:
 282    case SHADER_OPCODE_TXF:
 283    case SHADER_OPCODE_TXF_CMS:
 284    case SHADER_OPCODE_TXF_MCS:
 285    case SHADER_OPCODE_TXS:
 286    case SHADER_OPCODE_TG4:
 287    case SHADER_OPCODE_TG4_OFFSET:
 288       return inst->header_present ? 1 : 0;
 289    case SHADER_OPCODE_UNTYPED_ATOMIC:
 290    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 291       return 0;
 292    default:
 293       assert(!"not reached");
 294       return inst->mlen;
 295    }
 296 }
 297
 298 bool
 299 src_reg::equals(src_reg *r)
 300 {
 301    return (file == r->file &&
 302            reg == r->reg &&
 303            reg_offset == r->reg_offset &&
 304            type == r->type &&
 305            negate == r->negate &&
 306            abs == r->abs &&
 307            swizzle == r->swizzle &&
 308            !reladdr && !r->reladdr &&
 309            memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
 310                   sizeof(fixed_hw_reg)) == 0 &&
 311            imm.u == r->imm.u);
 312 }
 313
 314 /**
 315  * Must be called after calculate_live_intervales() to remove unused
 316  * writes to registers -- register allocation will fail otherwise
 317  * because something deffed but not used won't be considered to
 318  * interfere with other regs.
 319  */
 320 bool
 321 vec4_visitor::dead_code_eliminate()
 322 {
 323    bool progress = false;
 324    int pc = 0;
 325
 326    calculate_live_intervals();
 327
 328    foreach_list_safe(node, &this->instructions) {
 329       vec4_instruction *inst = (vec4_instruction *)node;
 330
 331       if (inst->dst.file == GRF && !inst->has_side_effects()) {
 332          assert(this->virtual_grf_end[inst->dst.reg] >= pc);
 333          if (this->virtual_grf_end[inst->dst.reg] == pc) {
 334             /* Don't dead code eliminate instructions that write to the
 335              * accumulator as a side-effect. Instead just set the destination
 336              * to the null register to free it.
 337              */
 338             switch (inst->opcode) {
 339             case BRW_OPCODE_ADDC:
 340             case BRW_OPCODE_SUBB:
 341             case BRW_OPCODE_MACH:
 342                inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
 343                break;
 344             default:
 345                inst->remove();
 346                break;
 347             }
 348             progress = true;
 349          }
 350       }
 351
 352       pc++;
 353    }
 354
 355    if (progress)
 356       invalidate_live_intervals();
 357
 358    return progress;
 359 }
 360
 361 void
 362 vec4_visitor::split_uniform_registers()
 363 {
 364    /* Prior to this, uniforms have been in an array sized according to
 365     * the number of vector uniforms present, sparsely filled (so an
 366     * aggregate results in reg indices being skipped over).  Now we're
 367     * going to cut those aggregates up so each .reg index is one
 368     * vector.  The goal is to make elimination of unused uniform
 369     * components easier later.
 370     */
 371    foreach_list(node, &this->instructions) {
 372       vec4_instruction *inst = (vec4_instruction *)node;
 373
 374       for (int i = 0 ; i < 3; i++) {
 375          if (inst->src[i].file != UNIFORM)
 376             continue;
 377
 378          assert(!inst->src[i].reladdr);
 379
 380          inst->src[i].reg += inst->src[i].reg_offset;
 381          inst->src[i].reg_offset = 0;
 382       }
 383    }
 384
 385    /* Update that everything is now vector-sized. */
 386    for (int i = 0; i < this->uniforms; i++) {
 387       this->uniform_size[i] = 1;
 388    }
 389 }
 390
 391 void
 392 vec4_visitor::pack_uniform_registers()
 393 {
 394    bool uniform_used[this->uniforms];
 395    int new_loc[this->uniforms];
 396    int new_chan[this->uniforms];
 397
 398    memset(uniform_used, 0, sizeof(uniform_used));
 399    memset(new_loc, 0, sizeof(new_loc));
 400    memset(new_chan, 0, sizeof(new_chan));
 401
 402    /* Find which uniform vectors are actually used by the program.  We
 403     * expect unused vector elements when we've moved array access out
 404     * to pull constants, and from some GLSL code generators like wine.
 405     */
 406    foreach_list(node, &this->instructions) {
 407       vec4_instruction *inst = (vec4_instruction *)node;
 408
 409       for (int i = 0 ; i < 3; i++) {
 410          if (inst->src[i].file != UNIFORM)
 411             continue;
 412
 413          uniform_used[inst->src[i].reg] = true;
 414       }
 415    }
 416
 417    int new_uniform_count = 0;
 418
 419    /* Now, figure out a packing of the live uniform vectors into our
 420     * push constants.
 421     */
 422    for (int src = 0; src < uniforms; src++) {
 423       int size = this->uniform_vector_size[src];
 424
 425       if (!uniform_used[src]) {
 426          this->uniform_vector_size[src] = 0;
 427          continue;
 428       }
 429
 430       int dst;
 431       /* Find the lowest place we can slot this uniform in. */
 432       for (dst = 0; dst < src; dst++) {
 433          if (this->uniform_vector_size[dst] + size <= 4)
 434             break;
 435       }
 436
 437       if (src == dst) {
 438          new_loc[src] = dst;
 439          new_chan[src] = 0;
 440       } else {
 441          new_loc[src] = dst;
 442          new_chan[src] = this->uniform_vector_size[dst];
 443
 444          /* Move the references to the data */
 445          for (int j = 0; j < size; j++) {
 446             prog_data->param[dst * 4 + new_chan[src] + j] =
 447                prog_data->param[src * 4 + j];
 448          }
 449
 450          this->uniform_vector_size[dst] += size;
 451          this->uniform_vector_size[src] = 0;
 452       }
 453
 454       new_uniform_count = MAX2(new_uniform_count, dst + 1);
 455    }
 456
 457    this->uniforms = new_uniform_count;
 458
 459    /* Now, update the instructions for our repacked uniforms. */
 460    foreach_list(node, &this->instructions) {
 461       vec4_instruction *inst = (vec4_instruction *)node;
 462
 463       for (int i = 0 ; i < 3; i++) {
 464          int src = inst->src[i].reg;
 465
 466          if (inst->src[i].file != UNIFORM)
 467             continue;
 468
 469          inst->src[i].reg = new_loc[src];
 470
 471          int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
 472          int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
 473          int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
 474          int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
 475          inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
 476       }
 477    }
 478 }
 479
 480 bool
 481 src_reg::is_zero() const
 482 {
 483    if (file != IMM)
 484       return false;
 485
 486    if (type == BRW_REGISTER_TYPE_F) {
 487       return imm.f == 0.0;
 488    } else {
 489       return imm.i == 0;
 490    }
 491 }
 492
 493 bool
 494 src_reg::is_one() const
 495 {
 496    if (file != IMM)
 497       return false;
 498
 499    if (type == BRW_REGISTER_TYPE_F) {
 500       return imm.f == 1.0;
 501    } else {
 502       return imm.i == 1;
 503    }
 504 }
 505
 506 /**
 507  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 508  *
 509  * While GLSL IR also performs this optimization, we end up with it in
 510  * our instruction stream for a couple of reasons.  One is that we
 511  * sometimes generate silly instructions, for example in array access
 512  * where we'll generate "ADD offset, index, base" even if base is 0.
 513  * The other is that GLSL IR's constant propagation doesn't track the
 514  * components of aggregates, so some VS patterns (initialize matrix to
 515  * 0, accumulate in vertex blending factors) end up breaking down to
 516  * instructions involving 0.
 517  */
 518 bool
 519 vec4_visitor::opt_algebraic()
 520 {
 521    bool progress = false;
 522
 523    foreach_list(node, &this->instructions) {
 524       vec4_instruction *inst = (vec4_instruction *)node;
 525
 526       switch (inst->opcode) {
 527       case BRW_OPCODE_ADD:
 528          if (inst->src[1].is_zero()) {
 529             inst->opcode = BRW_OPCODE_MOV;
 530             inst->src[1] = src_reg();
 531             progress = true;
 532          }
 533          break;
 534
 535       case BRW_OPCODE_MUL:
 536          if (inst->src[1].is_zero()) {
 537             inst->opcode = BRW_OPCODE_MOV;
 538             switch (inst->src[0].type) {
 539             case BRW_REGISTER_TYPE_F:
 540                inst->src[0] = src_reg(0.0f);
 541                break;
 542             case BRW_REGISTER_TYPE_D:
 543                inst->src[0] = src_reg(0);
 544                break;
 545             case BRW_REGISTER_TYPE_UD:
 546                inst->src[0] = src_reg(0u);
 547                break;
 548             default:
 549                assert(!"not reached");
 550                inst->src[0] = src_reg(0.0f);
 551                break;
 552             }
 553             inst->src[1] = src_reg();
 554             progress = true;
 555          } else if (inst->src[1].is_one()) {
 556             inst->opcode = BRW_OPCODE_MOV;
 557             inst->src[1] = src_reg();
 558             progress = true;
 559          }
 560          break;
 561       default:
 562          break;
 563       }
 564    }
 565
 566    if (progress)
 567       invalidate_live_intervals();
 568
 569    return progress;
 570 }
 571
 572 /**
 573  * Only a limited number of hardware registers may be used for push
 574  * constants, so this turns access to the overflowed constants into
 575  * pull constants.
 576  */
 577 void
 578 vec4_visitor::move_push_constants_to_pull_constants()
 579 {
 580    int pull_constant_loc[this->uniforms];
 581
 582    /* Only allow 32 registers (256 uniform components) as push constants,
 583     * which is the limit on gen6.
 584     */
 585    int max_uniform_components = 32 * 8;
 586    if (this->uniforms * 4 <= max_uniform_components)
 587       return;
 588
 589    /* Make some sort of choice as to which uniforms get sent to pull
 590     * constants.  We could potentially do something clever here like
 591     * look for the most infrequently used uniform vec4s, but leave
 592     * that for later.
 593     */
 594    for (int i = 0; i < this->uniforms * 4; i += 4) {
 595       pull_constant_loc[i / 4] = -1;
 596
 597       if (i >= max_uniform_components) {
 598          const float **values = &prog_data->param[i];
 599
 600          /* Try to find an existing copy of this uniform in the pull
 601           * constants if it was part of an array access already.
 602           */
 603          for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
 604             int matches;
 605
 606             for (matches = 0; matches < 4; matches++) {
 607                if (prog_data->pull_param[j + matches] != values[matches])
 608                   break;
 609             }
 610
 611             if (matches == 4) {
 612                pull_constant_loc[i / 4] = j / 4;
 613                break;
 614             }
 615          }
 616
 617          if (pull_constant_loc[i / 4] == -1) {
 618             assert(prog_data->nr_pull_params % 4 == 0);
 619             pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;
 620
 621             for (int j = 0; j < 4; j++) {
 622                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
 623             }
 624          }
 625       }
 626    }
 627
 628    /* Now actually rewrite usage of the things we've moved to pull
 629     * constants.
 630     */
 631    foreach_list_safe(node, &this->instructions) {
 632       vec4_instruction *inst = (vec4_instruction *)node;
 633
 634       for (int i = 0 ; i < 3; i++) {
 635          if (inst->src[i].file != UNIFORM ||
 636              pull_constant_loc[inst->src[i].reg] == -1)
 637             continue;
 638
 639          int uniform = inst->src[i].reg;
 640
 641          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
 642
 643          emit_pull_constant_load(inst, temp, inst->src[i],
 644                                  pull_constant_loc[uniform]);
 645
 646          inst->src[i].file = temp.file;
 647          inst->src[i].reg = temp.reg;
 648          inst->src[i].reg_offset = temp.reg_offset;
 649          inst->src[i].reladdr = NULL;
 650       }
 651    }
 652
 653    /* Repack push constants to remove the now-unused ones. */
 654    pack_uniform_registers();
 655 }
 656
 657 /**
 658  * Sets the dependency control fields on instructions after register
 659  * allocation and before the generator is run.
 660  *
 661  * When you have a sequence of instructions like:
 662  *
 663  * DP4 temp.x vertex uniform[0]
 664  * DP4 temp.y vertex uniform[0]
 665  * DP4 temp.z vertex uniform[0]
 666  * DP4 temp.w vertex uniform[0]
 667  *
 668  * The hardware doesn't know that it can actually run the later instructions
 669  * while the previous ones are in flight, producing stalls.  However, we have
 670  * manual fields we can set in the instructions that let it do so.
 671  */
 672 void
 673 vec4_visitor::opt_set_dependency_control()
 674 {
 675    vec4_instruction *last_grf_write[BRW_MAX_GRF];
 676    uint8_t grf_channels_written[BRW_MAX_GRF];
 677    vec4_instruction *last_mrf_write[BRW_MAX_GRF];
 678    uint8_t mrf_channels_written[BRW_MAX_GRF];
 679
 680    cfg_t cfg(&instructions);
 681
 682    assert(prog_data->total_grf ||
 683           !"Must be called after register allocation");
 684
 685    for (int i = 0; i < cfg.num_blocks; i++) {
 686       bblock_t *bblock = cfg.blocks[i];
 687       vec4_instruction *inst;
 688
 689       memset(last_grf_write, 0, sizeof(last_grf_write));
 690       memset(last_mrf_write, 0, sizeof(last_mrf_write));
 691
 692       for (inst = (vec4_instruction *)bblock->start;
 693            inst != (vec4_instruction *)bblock->end->next;
 694            inst = (vec4_instruction *)inst->next) {
 695          /* If we read from a register that we were doing dependency control
 696           * on, don't do dependency control across the read.
 697           */
 698          for (int i = 0; i < 3; i++) {
 699             int reg = inst->src[i].reg + inst->src[i].reg_offset;
 700             if (inst->src[i].file == GRF) {
 701                last_grf_write[reg] = NULL;
 702             } else if (inst->src[i].file == HW_REG) {
 703                memset(last_grf_write, 0, sizeof(last_grf_write));
 704                break;
 705             }
 706             assert(inst->src[i].file != MRF);
 707          }
 708
 709          /* In the presence of send messages, totally interrupt dependency
 710           * control.  They're long enough that the chance of dependency
 711           * control around them just doesn't matter.
 712           */
 713          if (inst->mlen) {
 714             memset(last_grf_write, 0, sizeof(last_grf_write));
 715             memset(last_mrf_write, 0, sizeof(last_mrf_write));
 716             continue;
 717          }
 718
 719          /* It looks like setting dependency control on a predicated
 720           * instruction hangs the GPU.
 721           */
 722          if (inst->predicate) {
 723             memset(last_grf_write, 0, sizeof(last_grf_write));
 724             memset(last_mrf_write, 0, sizeof(last_mrf_write));
 725             continue;
 726          }
 727
 728          /* Now, see if we can do dependency control for this instruction
 729           * against a previous one writing to its destination.
 730           */
 731          int reg = inst->dst.reg + inst->dst.reg_offset;
 732          if (inst->dst.file == GRF) {
 733             if (last_grf_write[reg] &&
 734                 !(inst->dst.writemask & grf_channels_written[reg])) {
 735                last_grf_write[reg]->no_dd_clear = true;
 736                inst->no_dd_check = true;
 737             } else {
 738                grf_channels_written[reg] = 0;
 739             }
 740
 741             last_grf_write[reg] = inst;
 742             grf_channels_written[reg] |= inst->dst.writemask;
 743          } else if (inst->dst.file == MRF) {
 744             if (last_mrf_write[reg] &&
 745                 !(inst->dst.writemask & mrf_channels_written[reg])) {
 746                last_mrf_write[reg]->no_dd_clear = true;
 747                inst->no_dd_check = true;
 748             } else {
 749                mrf_channels_written[reg] = 0;
 750             }
 751
 752             last_mrf_write[reg] = inst;
 753             mrf_channels_written[reg] |= inst->dst.writemask;
 754          } else if (inst->dst.reg == HW_REG) {
 755             if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
 756                memset(last_grf_write, 0, sizeof(last_grf_write));
 757             if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
 758                memset(last_mrf_write, 0, sizeof(last_mrf_write));
 759          }
 760       }
 761    }
 762 }
 763
 764 bool
 765 vec4_instruction::can_reswizzle_dst(int dst_writemask,
 766                                     int swizzle,
 767                                     int swizzle_mask)
 768 {
 769    /* If this instruction sets anything not referenced by swizzle, then we'd
 770     * totally break it when we reswizzle.
 771     */
 772    if (dst.writemask & ~swizzle_mask)
 773       return false;
 774
 775    switch (opcode) {
 776    case BRW_OPCODE_DP4:
 777    case BRW_OPCODE_DP3:
 778    case BRW_OPCODE_DP2:
 779       return true;
 780    default:
 781       /* Check if there happens to be no reswizzling required. */
 782       for (int c = 0; c < 4; c++) {
 783          int bit = 1 << BRW_GET_SWZ(swizzle, c);
 784          /* Skip components of the swizzle not used by the dst. */
 785          if (!(dst_writemask & (1 << c)))
 786             continue;
 787
 788          /* We don't do the reswizzling yet, so just sanity check that we
 789           * don't have to.
 790           */
 791          if (bit != (1 << c))
 792             return false;
 793       }
 794       return true;
 795    }
 796 }
 797
 798 /**
 799  * For any channels in the swizzle's source that were populated by this
 800  * instruction, rewrite the instruction to put the appropriate result directly
 801  * in those channels.
 802  *
 803  * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
 804  */
 805 void
 806 vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
 807 {
 808    int new_writemask = 0;
 809
 810    switch (opcode) {
 811    case BRW_OPCODE_DP4:
 812    case BRW_OPCODE_DP3:
 813    case BRW_OPCODE_DP2:
 814       for (int c = 0; c < 4; c++) {
 815          int bit = 1 << BRW_GET_SWZ(swizzle, c);
 816          /* Skip components of the swizzle not used by the dst. */
 817          if (!(dst_writemask & (1 << c)))
 818             continue;
 819          /* If we were populating this component, then populate the
 820           * corresponding channel of the new dst.
 821           */
 822          if (dst.writemask & bit)
 823             new_writemask |= (1 << c);
 824       }
 825       dst.writemask = new_writemask;
 826       break;
 827    default:
 828       for (int c = 0; c < 4; c++) {
 829          /* Skip components of the swizzle not used by the dst. */
 830          if (!(dst_writemask & (1 << c)))
 831             continue;
 832
 833          /* We don't do the reswizzling yet, so just sanity check that we
 834           * don't have to.
 835           */
 836          assert((1 << BRW_GET_SWZ(swizzle, c)) == (1 << c));
 837       }
 838       break;
 839    }
 840 }
 841
 842 /*
 843  * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 844  * just written and then MOVed into another reg and making the original write
 845  * of the GRF write directly to the final destination instead.
 846  */
 847 bool
 848 vec4_visitor::opt_register_coalesce()
 849 {
 850    bool progress = false;
 851    int next_ip = 0;
 852
 853    calculate_live_intervals();
 854
 855    foreach_list_safe(node, &this->instructions) {
 856       vec4_instruction *inst = (vec4_instruction *)node;
 857
 858       int ip = next_ip;
 859       next_ip++;
 860
 861       if (inst->opcode != BRW_OPCODE_MOV ||
 862           (inst->dst.file != GRF && inst->dst.file != MRF) ||
 863           inst->predicate ||
 864           inst->src[0].file != GRF ||
 865           inst->dst.type != inst->src[0].type ||
 866           inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
 867          continue;
 868
 869       bool to_mrf = (inst->dst.file == MRF);
 870
 871       /* Can't coalesce this GRF if someone else was going to
 872        * read it later.
 873        */
 874       if (this->virtual_grf_end[inst->src[0].reg] > ip)
 875          continue;
 876
 877       /* We need to check interference with the final destination between this
 878        * instruction and the earliest instruction involved in writing the GRF
 879        * we're eliminating.  To do that, keep track of which of our source
 880        * channels we've seen initialized.
 881        */
 882       bool chans_needed[4] = {false, false, false, false};
 883       int chans_remaining = 0;
 884       int swizzle_mask = 0;
 885       for (int i = 0; i < 4; i++) {
 886          int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);
 887
 888          if (!(inst->dst.writemask & (1 << i)))
 889             continue;
 890
 891          swizzle_mask |= (1 << chan);
 892
 893          if (!chans_needed[chan]) {
 894             chans_needed[chan] = true;
 895             chans_remaining++;
 896          }
 897       }
 898
 899       /* Now walk up the instruction stream trying to see if we can rewrite
 900        * everything writing to the temporary to write into the destination
 901        * instead.
 902        */
 903       vec4_instruction *scan_inst;
 904       for (scan_inst = (vec4_instruction *)inst->prev;
 905            scan_inst->prev != NULL;
 906            scan_inst = (vec4_instruction *)scan_inst->prev) {
 907          if (scan_inst->dst.file == GRF &&
 908              scan_inst->dst.reg == inst->src[0].reg &&
 909              scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
 910             /* Found something writing to the reg we want to coalesce away. */
 911             if (to_mrf) {
 912                /* SEND instructions can't have MRF as a destination. */
 913                if (scan_inst->mlen)
 914                   break;
 915
 916                if (brw->gen == 6) {
 917                   /* gen6 math instructions must have the destination be
 918                    * GRF, so no compute-to-MRF for them.
 919                    */
 920                   if (scan_inst->is_math()) {
 921                      break;
 922                   }
 923                }
 924             }
 925
 926             /* If we can't handle the swizzle, bail. */
 927             if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
 928                                               inst->src[0].swizzle,
 929                                               swizzle_mask)) {
 930                break;
 931             }
 932
 933             /* Mark which channels we found unconditional writes for. */
 934             if (!scan_inst->predicate) {
 935                for (int i = 0; i < 4; i++) {
 936                   if (scan_inst->dst.writemask & (1 << i) &&
 937                       chans_needed[i]) {
 938                      chans_needed[i] = false;
 939                      chans_remaining--;
 940                   }
 941                }
 942             }
 943
 944             if (chans_remaining == 0)
 945                break;
 946          }
 947
 948          /* We don't handle flow control here.  Most computation of values
 949           * that could be coalesced happens just before their use.
 950           */
 951          if (scan_inst->opcode == BRW_OPCODE_DO ||
 952              scan_inst->opcode == BRW_OPCODE_WHILE ||
 953              scan_inst->opcode == BRW_OPCODE_ELSE ||
 954              scan_inst->opcode == BRW_OPCODE_ENDIF) {
 955             break;
 956          }
 957
 958          /* You can't read from an MRF, so if someone else reads our MRF's
 959           * source GRF that we wanted to rewrite, that stops us.  If it's a
 960           * GRF we're trying to coalesce to, we don't actually handle
 961           * rewriting sources so bail in that case as well.
 962           */
 963          bool interfered = false;
 964          for (int i = 0; i < 3; i++) {
 965             if (scan_inst->src[i].file == GRF &&
 966                 scan_inst->src[i].reg == inst->src[0].reg &&
 967                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
 968                interfered = true;
 969             }
 970          }
 971          if (interfered)
 972             break;
 973
 974          /* If somebody else writes our destination here, we can't coalesce
 975           * before that.
 976           */
 977          if (scan_inst->dst.file == inst->dst.file &&
 978              scan_inst->dst.reg == inst->dst.reg) {
 979             break;
 980          }
 981
 982          /* Check for reads of the register we're trying to coalesce into.  We
 983           * can't go rewriting instructions above that to put some other value
 984           * in the register instead.
 985           */
 986          if (to_mrf && scan_inst->mlen > 0) {
 987             if (inst->dst.reg >= scan_inst->base_mrf &&
 988                 inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
 989                break;
 990             }
 991          } else {
 992             for (int i = 0; i < 3; i++) {
 993                if (scan_inst->src[i].file == inst->dst.file &&
 994                    scan_inst->src[i].reg == inst->dst.reg &&
 995                    scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
 996                   interfered = true;
 997                }
 998             }
 999             if (interfered)
1000                break;
1001          }
1002       }
1003
1004       if (chans_remaining == 0) {
1005          /* If we've made it here, we have an MOV we want to coalesce out, and
1006           * a scan_inst pointing to the earliest instruction involved in
1007           * computing the value.  Now go rewrite the instruction stream
1008           * between the two.
1009           */
1010
1011          while (scan_inst != inst) {
1012             if (scan_inst->dst.file == GRF &&
1013                 scan_inst->dst.reg == inst->src[0].reg &&
1014                 scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1015                scan_inst->reswizzle_dst(inst->dst.writemask,
1016                                         inst->src[0].swizzle);
1017                scan_inst->dst.file = inst->dst.file;
1018                scan_inst->dst.reg = inst->dst.reg;
1019                scan_inst->dst.reg_offset = inst->dst.reg_offset;
1020                scan_inst->saturate |= inst->saturate;
1021             }
1022             scan_inst = (vec4_instruction *)scan_inst->next;
1023          }
1024          inst->remove();
1025          progress = true;
1026       }
1027    }
1028
1029    if (progress)
1030       invalidate_live_intervals();
1031
1032    return progress;
1033 }
1034
1035 /**
1036  * Splits virtual GRFs requesting more than one contiguous physical register.
1037  *
1038  * We initially create large virtual GRFs for temporary structures, arrays,
1039  * and matrices, so that the dereference visitor functions can add reg_offsets
1040  * to work their way down to the actual member being accessed.  But when it
1041  * comes to optimization, we'd like to treat each register as individual
1042  * storage if possible.
1043  *
1044  * So far, the only thing that might prevent splitting is a send message from
1045  * a GRF on IVB.
1046  */
1047 void
1048 vec4_visitor::split_virtual_grfs()
1049 {
1050    int num_vars = this->virtual_grf_count;
1051    int new_virtual_grf[num_vars];
1052    bool split_grf[num_vars];
1053
1054    memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1055
1056    /* Try to split anything > 0 sized. */
1057    for (int i = 0; i < num_vars; i++) {
1058       split_grf[i] = this->virtual_grf_sizes[i] != 1;
1059    }
1060
1061    /* Check that the instructions are compatible with the registers we're trying
1062     * to split.
1063     */
1064    foreach_list(node, &this->instructions) {
1065       vec4_instruction *inst = (vec4_instruction *)node;
1066
1067       /* If there's a SEND message loading from a GRF on gen7+, it needs to be
1068        * contiguous.
1069        */
1070       if (inst->is_send_from_grf()) {
1071          for (int i = 0; i < 3; i++) {
1072             if (inst->src[i].file == GRF) {
1073                split_grf[inst->src[i].reg] = false;
1074             }
1075          }
1076       }
1077    }
1078
1079    /* Allocate new space for split regs.  Note that the virtual
1080     * numbers will be contiguous.
1081     */
1082    for (int i = 0; i < num_vars; i++) {
1083       if (!split_grf[i])
1084          continue;
1085
1086       new_virtual_grf[i] = virtual_grf_alloc(1);
1087       for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1088          int reg = virtual_grf_alloc(1);
1089          assert(reg == new_virtual_grf[i] + j - 1);
1090          (void) reg;
1091       }
1092       this->virtual_grf_sizes[i] = 1;
1093    }
1094
1095    foreach_list(node, &this->instructions) {
1096       vec4_instruction *inst = (vec4_instruction *)node;
1097
1098       if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
1099           inst->dst.reg_offset != 0) {
1100          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1101                           inst->dst.reg_offset - 1);
1102          inst->dst.reg_offset = 0;
1103       }
1104       for (int i = 0; i < 3; i++) {
1105          if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
1106              inst->src[i].reg_offset != 0) {
1107             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1108                                 inst->src[i].reg_offset - 1);
1109             inst->src[i].reg_offset = 0;
1110          }
1111       }
1112    }
1113    invalidate_live_intervals();
1114 }
1115
1116 void
1117 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1118 {
1119    vec4_instruction *inst = (vec4_instruction *)be_inst;
1120
1121    printf("%s", brw_instruction_name(inst->opcode));
1122    if (inst->conditional_mod) {
1123       printf("%s", conditional_modifier[inst->conditional_mod]);
1124    }
1125    printf(" ");
1126
1127    switch (inst->dst.file) {
1128    case GRF:
1129       printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
1130       break;
1131    case MRF:
1132       printf("m%d", inst->dst.reg);
1133       break;
1134    case HW_REG:
1135       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1136          switch (inst->dst.fixed_hw_reg.nr) {
1137          case BRW_ARF_NULL:
1138             printf("null");
1139             break;
1140          case BRW_ARF_ADDRESS:
1141             printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
1142             break;
1143          case BRW_ARF_ACCUMULATOR:
1144             printf("acc%d", inst->dst.fixed_hw_reg.subnr);
1145             break;
1146          case BRW_ARF_FLAG:
1147             printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
1148                              inst->dst.fixed_hw_reg.subnr);
1149             break;
1150          default:
1151             printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
1152                                inst->dst.fixed_hw_reg.subnr);
1153             break;
1154          }
1155       } else {
1156          printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
1157       }
1158       if (inst->dst.fixed_hw_reg.subnr)
1159          printf("+%d", inst->dst.fixed_hw_reg.subnr);
1160       break;
1161    case BAD_FILE:
1162       printf("(null)");
1163       break;
1164    default:
1165       printf("???");
1166       break;
1167    }
1168    if (inst->dst.writemask != WRITEMASK_XYZW) {
1169       printf(".");
1170       if (inst->dst.writemask & 1)
1171          printf("x");
1172       if (inst->dst.writemask & 2)
1173          printf("y");
1174       if (inst->dst.writemask & 4)
1175          printf("z");
1176       if (inst->dst.writemask & 8)
1177          printf("w");
1178    }
1179    printf(":%s, ", brw_reg_type_letters(inst->dst.type));
1180
1181    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1182       if (inst->src[i].negate)
1183          printf("-");
1184       if (inst->src[i].abs)
1185          printf("|");
1186       switch (inst->src[i].file) {
1187       case GRF:
1188          printf("vgrf%d", inst->src[i].reg);
1189          break;
1190       case ATTR:
1191          printf("attr%d", inst->src[i].reg);
1192          break;
1193       case UNIFORM:
1194          printf("u%d", inst->src[i].reg);
1195          break;
1196       case IMM:
1197          switch (inst->src[i].type) {
1198          case BRW_REGISTER_TYPE_F:
1199             printf("%fF", inst->src[i].imm.f);
1200             break;
1201          case BRW_REGISTER_TYPE_D:
1202             printf("%dD", inst->src[i].imm.i);
1203             break;
1204          case BRW_REGISTER_TYPE_UD:
1205             printf("%uU", inst->src[i].imm.u);
1206             break;
1207          default:
1208             printf("???");
1209             break;
1210          }
1211          break;
1212       case HW_REG:
1213          if (inst->src[i].fixed_hw_reg.negate)
1214             printf("-");
1215          if (inst->src[i].fixed_hw_reg.abs)
1216             printf("|");
1217          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1218             switch (inst->src[i].fixed_hw_reg.nr) {
1219             case BRW_ARF_NULL:
1220                printf("null");
1221                break;
1222             case BRW_ARF_ADDRESS:
1223                printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
1224                break;
1225             case BRW_ARF_ACCUMULATOR:
1226                printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
1227                break;
1228             case BRW_ARF_FLAG:
1229                printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
1230                                 inst->src[i].fixed_hw_reg.subnr);
1231                break;
1232             default:
1233                printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
1234                                   inst->src[i].fixed_hw_reg.subnr);
1235                break;
1236             }
1237          } else {
1238             printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
1239          }
1240          if (inst->src[i].fixed_hw_reg.subnr)
1241             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
1242          if (inst->src[i].fixed_hw_reg.abs)
1243             printf("|");
1244          break;
1245       case BAD_FILE:
1246          printf("(null)");
1247          break;
1248       default:
1249          printf("???");
1250          break;
1251       }
1252
1253       if (virtual_grf_sizes[inst->src[i].reg] != 1)
1254          printf(".%d", inst->src[i].reg_offset);
1255
1256       if (inst->src[i].file != IMM) {
1257          static const char *chans[4] = {"x", "y", "z", "w"};
1258          printf(".");
1259          for (int c = 0; c < 4; c++) {
1260             printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1261          }
1262       }
1263
1264       if (inst->src[i].abs)
1265          printf("|");
1266
1267       if (inst->src[i].file != IMM) {
1268          printf(":%s", reg_encoding[inst->src[i].type]);
1269       }
1270
1271       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1272          printf(", ");
1273    }
1274
1275    printf("\n");
1276 }
1277
1278
1279 static inline struct brw_reg
1280 attribute_to_hw_reg(int attr, bool interleaved)
1281 {
1282    if (interleaved)
1283       return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1284    else
1285       return brw_vec8_grf(attr, 0);
1286 }
1287
1288
1289 /**
1290  * Replace each register of type ATTR in this->instructions with a reference
1291  * to a fixed HW register.
1292  *
1293  * If interleaved is true, then each attribute takes up half a register, with
1294  * register N containing attribute 2*N in its first half and attribute 2*N+1
1295  * in its second half (this corresponds to the payload setup used by geometry
1296  * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
1297  * false, then each attribute takes up a whole register, with register N
1298  * containing attribute N (this corresponds to the payload setup used by
1299  * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1300  */
1301 void
1302 vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1303                                           bool interleaved)
1304 {
1305    foreach_list(node, &this->instructions) {
1306       vec4_instruction *inst = (vec4_instruction *)node;
1307
1308       /* We have to support ATTR as a destination for GL_FIXED fixup. */
1309       if (inst->dst.file == ATTR) {
1310          int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];
1311
1312          /* All attributes used in the shader need to have been assigned a
1313           * hardware register by the caller
1314           */
1315          assert(grf != 0);
1316
1317          struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1318          reg.type = inst->dst.type;
1319          reg.dw1.bits.writemask = inst->dst.writemask;
1320
1321          inst->dst.file = HW_REG;
1322          inst->dst.fixed_hw_reg = reg;
1323       }
1324
1325       for (int i = 0; i < 3; i++) {
1326          if (inst->src[i].file != ATTR)
1327             continue;
1328
1329          int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
1330
1331          /* All attributes used in the shader need to have been assigned a
1332           * hardware register by the caller
1333           */
1334          assert(grf != 0);
1335
1336          struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1337          reg.dw1.bits.swizzle = inst->src[i].swizzle;
1338          reg.type = inst->src[i].type;
1339          if (inst->src[i].abs)
1340             reg = brw_abs(reg);
1341          if (inst->src[i].negate)
1342             reg = negate(reg);
1343
1344          inst->src[i].file = HW_REG;
1345          inst->src[i].fixed_hw_reg = reg;
1346       }
1347    }
1348 }
1349
1350 int
1351 vec4_vs_visitor::setup_attributes(int payload_reg)
1352 {
1353    int nr_attributes;
1354    int attribute_map[VERT_ATTRIB_MAX + 1];
1355    memset(attribute_map, 0, sizeof(attribute_map));
1356
1357    nr_attributes = 0;
1358    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1359       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1360          attribute_map[i] = payload_reg + nr_attributes;
1361          nr_attributes++;
1362       }
1363    }
1364
1365    /* VertexID is stored by the VF as the last vertex element, but we
1366     * don't represent it with a flag in inputs_read, so we call it
1367     * VERT_ATTRIB_MAX.
1368     */
1369    if (vs_prog_data->uses_vertexid) {
1370       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1371       nr_attributes++;
1372    }
1373
1374    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1375
1376    /* The BSpec says we always have to read at least one thing from
1377     * the VF, and it appears that the hardware wedges otherwise.
1378     */
1379    if (nr_attributes == 0)
1380       nr_attributes = 1;
1381
1382    prog_data->urb_read_length = (nr_attributes + 1) / 2;
1383
1384    unsigned vue_entries =
1385       MAX2(nr_attributes, prog_data->vue_map.num_slots);
1386
1387    if (brw->gen == 6)
1388       prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
1389    else
1390       prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;
1391
1392    return payload_reg + nr_attributes;
1393 }
1394
1395 int
1396 vec4_visitor::setup_uniforms(int reg)
1397 {
1398    prog_data->dispatch_grf_start_reg = reg;
1399
1400    /* The pre-gen6 VS requires that some push constants get loaded no
1401     * matter what, or the GPU would hang.
1402     */
1403    if (brw->gen < 6 && this->uniforms == 0) {
1404       this->uniform_vector_size[this->uniforms] = 1;
1405
1406       prog_data->param = reralloc(NULL, prog_data->param, const float *, 4);
1407       for (unsigned int i = 0; i < 4; i++) {
1408          unsigned int slot = this->uniforms * 4 + i;
1409          static float zero = 0.0;
1410          prog_data->param[slot] = &zero;
1411       }
1412
1413       this->uniforms++;
1414       reg++;
1415    } else {
1416       reg += ALIGN(uniforms, 2) / 2;
1417    }
1418
1419    prog_data->nr_params = this->uniforms * 4;
1420
1421    prog_data->curb_read_length = reg - prog_data->dispatch_grf_start_reg;
1422
1423    return reg;
1424 }
1425
1426 void
1427 vec4_vs_visitor::setup_payload(void)
1428 {
1429    int reg = 0;
1430
1431    /* The payload always contains important data in g0, which contains
1432     * the URB handles that are passed on to the URB write at the end
1433     * of the thread.  So, we always start push constants at g1.
1434     */
1435    reg++;
1436
1437    reg = setup_uniforms(reg);
1438
1439    reg = setup_attributes(reg);
1440
1441    this->first_non_payload_grf = reg;
1442 }
1443
1444 src_reg
1445 vec4_visitor::get_timestamp()
1446 {
1447    assert(brw->gen >= 7);
1448
1449    src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1450                                 BRW_ARF_TIMESTAMP,
1451                                 0,
1452                                 BRW_REGISTER_TYPE_UD,
1453                                 BRW_VERTICAL_STRIDE_0,
1454                                 BRW_WIDTH_4,
1455                                 BRW_HORIZONTAL_STRIDE_4,
1456                                 BRW_SWIZZLE_XYZW,
1457                                 WRITEMASK_XYZW));
1458
1459    dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1460
1461    vec4_instruction *mov = emit(MOV(dst, ts));
1462    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1463     * even if it's not enabled in the dispatch.
1464     */
1465    mov->force_writemask_all = true;
1466
1467    return src_reg(dst);
1468 }
1469
1470 void
1471 vec4_visitor::emit_shader_time_begin()
1472 {
1473    current_annotation = "shader time start";
1474    shader_start_time = get_timestamp();
1475 }
1476
1477 void
1478 vec4_visitor::emit_shader_time_end()
1479 {
1480    current_annotation = "shader time end";
1481    src_reg shader_end_time = get_timestamp();
1482
1483
1484    /* Check that there weren't any timestamp reset events (assuming these
1485     * were the only two timestamp reads that happened).
1486     */
1487    src_reg reset_end = shader_end_time;
1488    reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1489    vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
1490    test->conditional_mod = BRW_CONDITIONAL_Z;
1491
1492    emit(IF(BRW_PREDICATE_NORMAL));
1493
1494    /* Take the current timestamp and get the delta. */
1495    shader_start_time.negate = true;
1496    dst_reg diff = dst_reg(this, glsl_type::uint_type);
1497    emit(ADD(diff, shader_start_time, shader_end_time));
1498
1499    /* If there were no instructions between the two timestamp gets, the diff
1500     * is 2 cycles.  Remove that overhead, so I can forget about that when
1501     * trying to determine the time taken for single instructions.
1502     */
1503    emit(ADD(diff, src_reg(diff), src_reg(-2u)));
1504
1505    emit_shader_time_write(st_base, src_reg(diff));
1506    emit_shader_time_write(st_written, src_reg(1u));
1507    emit(BRW_OPCODE_ELSE);
1508    emit_shader_time_write(st_reset, src_reg(1u));
1509    emit(BRW_OPCODE_ENDIF);
1510 }
1511
1512 void
1513 vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
1514                                      src_reg value)
1515 {
1516    int shader_time_index =
1517       brw_get_shader_time_index(brw, shader_prog, prog, type);
1518
1519    dst_reg dst =
1520       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1521
1522    dst_reg offset = dst;
1523    dst_reg time = dst;
1524    time.reg_offset++;
1525
1526    offset.type = BRW_REGISTER_TYPE_UD;
1527    emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));
1528
1529    time.type = BRW_REGISTER_TYPE_UD;
1530    emit(MOV(time, src_reg(value)));
1531
1532    emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1533 }
1534
1535 bool
1536 vec4_visitor::run()
1537 {
1538    sanity_param_count = prog->Parameters->NumParameters;
1539
1540    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
1541       emit_shader_time_begin();
1542
1543    assign_common_binding_table_offsets(0);
1544
1545    emit_prolog();
1546
1547    /* Generate VS IR for main().  (the visitor only descends into
1548     * functions called "main").
1549     */
1550    if (shader) {
1551       visit_instructions(shader->base.ir);
1552    } else {
1553       emit_program_code();
1554    }
1555    base_ir = NULL;
1556
1557    if (key->userclip_active && !prog->UsesClipDistanceOut)
1558       setup_uniform_clipplane_values();
1559
1560    emit_thread_end();
1561
1562    /* Before any optimization, push array accesses out to scratch
1563     * space where we need them to be.  This pass may allocate new
1564     * virtual GRFs, so we want to do it early.  It also makes sure
1565     * that we have reladdr computations available for CSE, since we'll
1566     * often do repeated subexpressions for those.
1567     */
1568    if (shader) {
1569       move_grf_array_access_to_scratch();
1570       move_uniform_array_access_to_pull_constants();
1571    } else {
1572       /* The ARB_vertex_program frontend emits pull constant loads directly
1573        * rather than using reladdr, so we don't need to walk through all the
1574        * instructions looking for things to move.  There isn't anything.
1575        *
1576        * We do still need to split things to vec4 size.
1577        */
1578       split_uniform_registers();
1579    }
1580    pack_uniform_registers();
1581    move_push_constants_to_pull_constants();
1582    split_virtual_grfs();
1583
1584    bool progress;
1585    do {
1586       progress = false;
1587       progress = dead_code_eliminate() || progress;
1588       progress = dead_control_flow_eliminate(this) || progress;
1589       progress = opt_copy_propagation() || progress;
1590       progress = opt_algebraic() || progress;
1591       progress = opt_register_coalesce() || progress;
1592    } while (progress);
1593
1594
1595    if (failed)
1596       return false;
1597
1598    setup_payload();
1599
1600    if (false) {
1601       /* Debug of register spilling: Go spill everything. */
1602       const int grf_count = virtual_grf_count;
1603       float spill_costs[virtual_grf_count];
1604       bool no_spill[virtual_grf_count];
1605       evaluate_spill_costs(spill_costs, no_spill);
1606       for (int i = 0; i < grf_count; i++) {
1607          if (no_spill[i])
1608             continue;
1609          spill_reg(i);
1610       }
1611    }
1612
1613    while (!reg_allocate()) {
1614       if (failed)
1615          return false;
1616    }
1617
1618    opt_schedule_instructions();
1619
1620    opt_set_dependency_control();
1621
1622    /* If any state parameters were appended, then ParameterValues could have
1623     * been realloced, in which case the driver uniform storage set up by
1624     * _mesa_associate_uniform_storage() would point to freed memory.  Make
1625     * sure that didn't happen.
1626     */
1627    assert(sanity_param_count == prog->Parameters->NumParameters);
1628
1629    return !failed;
1630 }
1631
1632 } /* namespace brw */
1633
1634 extern "C" {
1635
1636 /**
1637  * Compile a vertex shader.
1638  *
1639  * Returns the final assembly and the program's size.
1640  */
1641 const unsigned *
1642 brw_vs_emit(struct brw_context *brw,
1643             struct gl_shader_program *prog,
1644             struct brw_vs_compile *c,
1645             struct brw_vs_prog_data *prog_data,
1646             void *mem_ctx,
1647             unsigned *final_assembly_size)
1648 {
1649    bool start_busy = false;
1650    float start_time = 0;
1651
1652    if (unlikely(brw->perf_debug)) {
1653       start_busy = (brw->batch.last_bo &&
1654                     drm_intel_bo_busy(brw->batch.last_bo));
1655       start_time = get_time();
1656    }
1657
1658    struct brw_shader *shader = NULL;
1659    if (prog)
1660       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
1661
1662    if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1663       if (prog) {
1664          printf("GLSL IR for native vertex shader %d:\n", prog->Name);
1665          _mesa_print_ir(shader->base.ir, NULL);
1666          printf("\n\n");
1667       } else {
1668          printf("ARB_vertex_program %d for native vertex shader\n",
1669                 c->vp->program.Base.Id);
1670          _mesa_print_program(&c->vp->program.Base);
1671       }
1672    }
1673
1674    vec4_vs_visitor v(brw, c, prog_data, prog, shader, mem_ctx);
1675    if (!v.run()) {
1676       if (prog) {
1677          prog->LinkStatus = false;
1678          ralloc_strcat(&prog->InfoLog, v.fail_msg);
1679       }
1680
1681       _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
1682                     v.fail_msg);
1683
1684       return NULL;
1685    }
1686
1687    const unsigned *assembly = NULL;
1688    if (brw->gen >= 8) {
1689       gen8_vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
1690                             mem_ctx, INTEL_DEBUG & DEBUG_VS);
1691       assembly = g.generate_assembly(&v.instructions, final_assembly_size);
1692    } else {
1693       vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
1694                        mem_ctx, INTEL_DEBUG & DEBUG_VS);
1695       assembly = g.generate_assembly(&v.instructions, final_assembly_size);
1696    }
1697
1698    if (unlikely(brw->perf_debug) && shader) {
1699       if (shader->compiled_once) {
1700          brw_vs_debug_recompile(brw, prog, &c->key);
1701       }
1702       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
1703          perf_debug("VS compile took %.03f ms and stalled the GPU\n",
1704                     (get_time() - start_time) * 1000);
1705       }
1706       shader->compiled_once = true;
1707    }
1708
1709    return assembly;
1710 }
1711
1712
1713 void
1714 brw_vec4_setup_prog_key_for_precompile(struct gl_context *ctx,
1715                                        struct brw_vec4_prog_key *key,
1716                                        GLuint id, struct gl_program *prog)
1717 {
1718    key->program_string_id = id;
1719    key->clamp_vertex_color = ctx->API == API_OPENGL_COMPAT;
1720
1721    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
1722    for (unsigned i = 0; i < sampler_count; i++) {
1723       if (prog->ShadowSamplers & (1 << i)) {
1724          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
1725          key->tex.swizzles[i] =
1726             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
1727       } else {
1728          /* Color sampler: assume no swizzling. */
1729          key->tex.swizzles[i] = SWIZZLE_XYZW;
1730       }
1731    }
1732 }
1733
1734
1735 bool
1736 brw_vec4_prog_data_compare(const struct brw_vec4_prog_data *a,
1737                            const struct brw_vec4_prog_data *b)
1738 {
1739    /* Compare all the struct (including the base) up to the pointers. */
1740    if (memcmp(a, b, offsetof(struct brw_vec4_prog_data, param)))
1741       return false;
1742
1743    if (memcmp(a->param, b->param, a->nr_params * sizeof(void *)))
1744       return false;
1745
1746    if (memcmp(a->pull_param, b->pull_param, a->nr_pull_params * sizeof(void *)))
1747       return false;
1748
1749    return true;
1750 }
1751
1752
1753 void
1754 brw_vec4_prog_data_free(const struct brw_vec4_prog_data *prog_data)
1755 {
1756    ralloc_free((void *)prog_data->param);
1757    ralloc_free((void *)prog_data->pull_param);
1758 }
1759
1760
1761 } /* extern "C" */