src/mesa/drivers/dri/i965/brw_vec4.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_fs.h"
  26 #include "brw_cfg.h"
  27 #include "brw_vs.h"
  28 #include "brw_nir.h"
  29 #include "brw_vec4_live_variables.h"
  30 #include "brw_dead_control_flow.h"
  31 #include "program/prog_parameter.h"
  32
  33 #define MAX_INSTRUCTION (1 << 30)
  34
  35 using namespace brw;
  36
  37 namespace brw {
  38
  39 void
  40 src_reg::init()
  41 {
  42    memset(this, 0, sizeof(*this));
  43
  44    this->file = BAD_FILE;
  45 }
  46
  47 src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
  48 {
  49    init();
  50
  51    this->file = file;
  52    this->nr = nr;
  53    if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
  54       this->swizzle = brw_swizzle_for_size(type->vector_elements);
  55    else
  56       this->swizzle = BRW_SWIZZLE_XYZW;
  57    if (type)
  58       this->type = brw_type_for_base_type(type);
  59 }
  60
  61 /** Generic unset register constructor. */
  62 src_reg::src_reg()
  63 {
  64    init();
  65 }
  66
  67 src_reg::src_reg(struct ::brw_reg reg) :
  68    backend_reg(reg)
  69 {
  70    this->reg_offset = 0;
  71    this->reladdr = NULL;
  72 }
  73
  74 src_reg::src_reg(const dst_reg &reg) :
  75    backend_reg(reg)
  76 {
  77    this->reladdr = reg.reladdr;
  78    this->swizzle = brw_swizzle_for_mask(reg.writemask);
  79 }
  80
  81 void
  82 dst_reg::init()
  83 {
  84    memset(this, 0, sizeof(*this));
  85    this->file = BAD_FILE;
  86    this->writemask = WRITEMASK_XYZW;
  87 }
  88
  89 dst_reg::dst_reg()
  90 {
  91    init();
  92 }
  93
  94 dst_reg::dst_reg(enum brw_reg_file file, int nr)
  95 {
  96    init();
  97
  98    this->file = file;
  99    this->nr = nr;
 100 }
 101
 102 dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
 103                  unsigned writemask)
 104 {
 105    init();
 106
 107    this->file = file;
 108    this->nr = nr;
 109    this->type = brw_type_for_base_type(type);
 110    this->writemask = writemask;
 111 }
 112
 113 dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
 114                  unsigned writemask)
 115 {
 116    init();
 117
 118    this->file = file;
 119    this->nr = nr;
 120    this->type = type;
 121    this->writemask = writemask;
 122 }
 123
 124 dst_reg::dst_reg(struct ::brw_reg reg) :
 125    backend_reg(reg)
 126 {
 127    this->reg_offset = 0;
 128    this->reladdr = NULL;
 129 }
 130
 131 dst_reg::dst_reg(const src_reg &reg) :
 132    backend_reg(reg)
 133 {
 134    this->writemask = brw_mask_for_swizzle(reg.swizzle);
 135    this->reladdr = reg.reladdr;
 136 }
 137
 138 bool
 139 dst_reg::equals(const dst_reg &r) const
 140 {
 141    return (this->backend_reg::equals(r) &&
 142            (reladdr == r.reladdr ||
 143             (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
 144 }
 145
 146 bool
 147 vec4_instruction::is_send_from_grf()
 148 {
 149    switch (opcode) {
 150    case SHADER_OPCODE_SHADER_TIME_ADD:
 151    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
 152    case SHADER_OPCODE_UNTYPED_ATOMIC:
 153    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 154    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 155    case SHADER_OPCODE_TYPED_ATOMIC:
 156    case SHADER_OPCODE_TYPED_SURFACE_READ:
 157    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 158       return true;
 159    default:
 160       return false;
 161    }
 162 }
 163
 164 /**
 165  * Returns true if this instruction's sources and destinations cannot
 166  * safely be the same register.
 167  *
 168  * In most cases, a register can be written over safely by the same
 169  * instruction that is its last use.  For a single instruction, the
 170  * sources are dereferenced before writing of the destination starts
 171  * (naturally).
 172  *
 173  * However, there are a few cases where this can be problematic:
 174  *
 175  * - Virtual opcodes that translate to multiple instructions in the
 176  *   code generator: if src == dst and one instruction writes the
 177  *   destination before a later instruction reads the source, then
 178  *   src will have been clobbered.
 179  *
 180  * The register allocator uses this information to set up conflicts between
 181  * GRF sources and the destination.
 182  */
 183 bool
 184 vec4_instruction::has_source_and_destination_hazard() const
 185 {
 186    switch (opcode) {
 187    /* Most opcodes in the vec4 world use MRFs. */
 188    default:
 189       return false;
 190    }
 191 }
 192
 193 unsigned
 194 vec4_instruction::regs_read(unsigned arg) const
 195 {
 196    if (src[arg].file == BAD_FILE)
 197       return 0;
 198
 199    switch (opcode) {
 200    case SHADER_OPCODE_SHADER_TIME_ADD:
 201    case SHADER_OPCODE_UNTYPED_ATOMIC:
 202    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 203    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 204    case SHADER_OPCODE_TYPED_ATOMIC:
 205    case SHADER_OPCODE_TYPED_SURFACE_READ:
 206    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 207       return arg == 0 ? mlen : 1;
 208
 209    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
 210       return arg == 1 ? mlen : 1;
 211
 212    default:
 213       return 1;
 214    }
 215 }
 216
 217 bool
 218 vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
 219 {
 220    if (devinfo->gen == 6 && is_math())
 221       return false;
 222
 223    if (is_send_from_grf())
 224       return false;
 225
 226    if (!backend_instruction::can_do_source_mods())
 227       return false;
 228
 229    return true;
 230 }
 231
 232 bool
 233 vec4_instruction::can_change_types() const
 234 {
 235    return dst.type == src[0].type &&
 236           !src[0].abs && !src[0].negate && !saturate &&
 237           (opcode == BRW_OPCODE_MOV ||
 238            (opcode == BRW_OPCODE_SEL &&
 239             dst.type == src[1].type &&
 240             predicate != BRW_PREDICATE_NONE &&
 241             !src[1].abs && !src[1].negate));
 242 }
 243
 244 /**
 245  * Returns how many MRFs an opcode will write over.
 246  *
 247  * Note that this is not the 0 or 1 implied writes in an actual gen
 248  * instruction -- the generate_* functions generate additional MOVs
 249  * for setup.
 250  */
 251 int
 252 vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
 253 {
 254    if (inst->mlen == 0 || inst->is_send_from_grf())
 255       return 0;
 256
 257    switch (inst->opcode) {
 258    case SHADER_OPCODE_RCP:
 259    case SHADER_OPCODE_RSQ:
 260    case SHADER_OPCODE_SQRT:
 261    case SHADER_OPCODE_EXP2:
 262    case SHADER_OPCODE_LOG2:
 263    case SHADER_OPCODE_SIN:
 264    case SHADER_OPCODE_COS:
 265       return 1;
 266    case SHADER_OPCODE_INT_QUOTIENT:
 267    case SHADER_OPCODE_INT_REMAINDER:
 268    case SHADER_OPCODE_POW:
 269       return 2;
 270    case VS_OPCODE_URB_WRITE:
 271       return 1;
 272    case VS_OPCODE_PULL_CONSTANT_LOAD:
 273       return 2;
 274    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 275       return 2;
 276    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 277       return 3;
 278    case GS_OPCODE_URB_WRITE:
 279    case GS_OPCODE_URB_WRITE_ALLOCATE:
 280    case GS_OPCODE_THREAD_END:
 281       return 0;
 282    case GS_OPCODE_FF_SYNC:
 283       return 1;
 284    case SHADER_OPCODE_SHADER_TIME_ADD:
 285       return 0;
 286    case SHADER_OPCODE_TEX:
 287    case SHADER_OPCODE_TXL:
 288    case SHADER_OPCODE_TXD:
 289    case SHADER_OPCODE_TXF:
 290    case SHADER_OPCODE_TXF_CMS:
 291    case SHADER_OPCODE_TXF_CMS_W:
 292    case SHADER_OPCODE_TXF_MCS:
 293    case SHADER_OPCODE_TXS:
 294    case SHADER_OPCODE_TG4:
 295    case SHADER_OPCODE_TG4_OFFSET:
 296    case SHADER_OPCODE_SAMPLEINFO:
 297    case VS_OPCODE_GET_BUFFER_SIZE:
 298       return inst->header_size;
 299    default:
 300       unreachable("not reached");
 301    }
 302 }
 303
 304 bool
 305 src_reg::equals(const src_reg &r) const
 306 {
 307    return (this->backend_reg::equals(r) &&
 308            !reladdr && !r.reladdr);
 309 }
 310
 311 bool
 312 vec4_visitor::opt_vector_float()
 313 {
 314    bool progress = false;
 315
 316    int last_reg = -1, last_reg_offset = -1;
 317    enum brw_reg_file last_reg_file = BAD_FILE;
 318
 319    int remaining_channels = 0;
 320    uint8_t imm[4];
 321    int inst_count = 0;
 322    vec4_instruction *imm_inst[4];
 323
 324    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
 325       if (last_reg != inst->dst.nr ||
 326           last_reg_offset != inst->dst.reg_offset ||
 327           last_reg_file != inst->dst.file) {
 328          last_reg = inst->dst.nr;
 329          last_reg_offset = inst->dst.reg_offset;
 330          last_reg_file = inst->dst.file;
 331          remaining_channels = WRITEMASK_XYZW;
 332
 333          inst_count = 0;
 334       }
 335
 336       if (inst->opcode != BRW_OPCODE_MOV ||
 337           inst->dst.writemask == WRITEMASK_XYZW ||
 338           inst->src[0].file != IMM)
 339          continue;
 340
 341       int vf = brw_float_to_vf(inst->src[0].f);
 342       if (vf == -1)
 343          continue;
 344
 345       if ((inst->dst.writemask & WRITEMASK_X) != 0)
 346          imm[0] = vf;
 347       if ((inst->dst.writemask & WRITEMASK_Y) != 0)
 348          imm[1] = vf;
 349       if ((inst->dst.writemask & WRITEMASK_Z) != 0)
 350          imm[2] = vf;
 351       if ((inst->dst.writemask & WRITEMASK_W) != 0)
 352          imm[3] = vf;
 353
 354       imm_inst[inst_count++] = inst;
 355
 356       remaining_channels &= ~inst->dst.writemask;
 357       if (remaining_channels == 0) {
 358          unsigned vf;
 359          memcpy(&vf, imm, sizeof(vf));
 360          vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf));
 361          mov->dst.type = BRW_REGISTER_TYPE_F;
 362          mov->dst.writemask = WRITEMASK_XYZW;
 363          inst->insert_after(block, mov);
 364          last_reg = -1;
 365
 366          for (int i = 0; i < inst_count; i++) {
 367             imm_inst[i]->remove(block);
 368          }
 369          progress = true;
 370       }
 371    }
 372
 373    if (progress)
 374       invalidate_live_intervals();
 375
 376    return progress;
 377 }
 378
 379 /* Replaces unused channels of a swizzle with channels that are used.
 380  *
 381  * For instance, this pass transforms
 382  *
 383  *    mov vgrf4.yz, vgrf5.wxzy
 384  *
 385  * into
 386  *
 387  *    mov vgrf4.yz, vgrf5.xxzx
 388  *
 389  * This eliminates false uses of some channels, letting dead code elimination
 390  * remove the instructions that wrote them.
 391  */
 392 bool
 393 vec4_visitor::opt_reduce_swizzle()
 394 {
 395    bool progress = false;
 396
 397    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
 398       if (inst->dst.file == BAD_FILE ||
 399           inst->dst.file == ARF ||
 400           inst->dst.file == FIXED_GRF ||
 401           inst->is_send_from_grf())
 402          continue;
 403
 404       unsigned swizzle;
 405
 406       /* Determine which channels of the sources are read. */
 407       switch (inst->opcode) {
 408       case VEC4_OPCODE_PACK_BYTES:
 409       case BRW_OPCODE_DP4:
 410       case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
 411                             *           but all four of src1.
 412                             */
 413          swizzle = brw_swizzle_for_size(4);
 414          break;
 415       case BRW_OPCODE_DP3:
 416          swizzle = brw_swizzle_for_size(3);
 417          break;
 418       case BRW_OPCODE_DP2:
 419          swizzle = brw_swizzle_for_size(2);
 420          break;
 421       default:
 422          swizzle = brw_swizzle_for_mask(inst->dst.writemask);
 423          break;
 424       }
 425
 426       /* Update sources' swizzles. */
 427       for (int i = 0; i < 3; i++) {
 428          if (inst->src[i].file != VGRF &&
 429              inst->src[i].file != ATTR &&
 430              inst->src[i].file != UNIFORM)
 431             continue;
 432
 433          const unsigned new_swizzle =
 434             brw_compose_swizzle(swizzle, inst->src[i].swizzle);
 435          if (inst->src[i].swizzle != new_swizzle) {
 436             inst->src[i].swizzle = new_swizzle;
 437             progress = true;
 438          }
 439       }
 440    }
 441
 442    if (progress)
 443       invalidate_live_intervals();
 444
 445    return progress;
 446 }
 447
 448 void
 449 vec4_visitor::split_uniform_registers()
 450 {
 451    /* Prior to this, uniforms have been in an array sized according to
 452     * the number of vector uniforms present, sparsely filled (so an
 453     * aggregate results in reg indices being skipped over).  Now we're
 454     * going to cut those aggregates up so each .nr index is one
 455     * vector.  The goal is to make elimination of unused uniform
 456     * components easier later.
 457     */
 458    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
 459       for (int i = 0 ; i < 3; i++) {
 460          if (inst->src[i].file != UNIFORM)
 461             continue;
 462
 463          assert(!inst->src[i].reladdr);
 464
 465          inst->src[i].nr += inst->src[i].reg_offset;
 466          inst->src[i].reg_offset = 0;
 467       }
 468    }
 469
 470    /* Update that everything is now vector-sized. */
 471    for (int i = 0; i < this->uniforms; i++) {
 472       this->uniform_size[i] = 1;
 473    }
 474 }
 475
/**
 * Repacks the uniform vectors that the program actually reads into the
 * lowest uniform slots.
 *
 * The pass runs in three steps:
 *  1. Scan all instructions and record, per uniform vector, how many
 *     leading channels are read (chans_used).
 *  2. Walk the uniforms in order and place each used one in the lowest
 *     slot that still has room, moving the matching
 *     stage_prog_data->param entries and recording the new location
 *     (new_loc) and starting channel (new_chan).
 *  3. Rewrite every UNIFORM source to its new register number and shift
 *     its swizzle by the new starting channel.
 *
 * this->uniforms is updated to the packed vector count.
 */
void
vec4_visitor::pack_uniform_registers()
{
   /* Per uniform vector: channels in use, new vector index, new first
    * channel.  All indexed by the pre-packing uniform number.
    */
   uint8_t chans_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(chans_used, 0, sizeof(chans_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* Mask of source channels this opcode reads: fixed for the dot
       * products and PACK_BYTES, otherwise the destination writemask.
       */
      unsigned readmask;
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
         readmask = 0xf;
         break;
      case BRW_OPCODE_DP3:
         readmask = 0x7;
         break;
      case BRW_OPCODE_DP2:
         readmask = 0x3;
         break;
      default:
         readmask = inst->dst.writemask;
         break;
      }

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int reg = inst->src[i].nr;
         for (int c = 0; c < 4; c++) {
            if (!(readmask & (1 << c)))
               continue;

            /* Track the highest component selected by a read channel,
             * stored as a count (component index + 1).
             */
            chans_used[reg] = MAX2(chans_used[reg],
                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
         }
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = chans_used[src];

      /* Unused vectors take no space in the packed layout. */
      if (size == 0)
         continue;

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (chans_used[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         /* No earlier slot has room: the vector stays where it is. */
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         /* Append after the channels already occupying the target slot. */
         new_loc[src] = dst;
         new_chan[src] = chans_used[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         /* The target slot grows and the old slot becomes free. */
         chans_used[dst] += size;
         chans_used[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         /* Read before the file check; only used for UNIFORM sources. */
         int src = inst->src[i].nr;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].nr = new_loc[src];
         /* Shift all four swizzle components by the new first channel. */
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}
 580
/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 *
 * Besides the ADD/MUL identities, the pass also folds saturate on
 * immediate MOVs, turns UNPACK_UNIFORM of a non-uniform into a MOV,
 * simplifies CMP.ge of -|x| against zero, fuses RCP(SQRT(x)) into RSQ and
 * lowers trivial BROADCASTs to MOV.
 *
 * \return true if any instruction was changed; live intervals are
 *         invalidated in that case.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         /* Only immediate sources can have their saturate folded. */
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            /* If the helper reports that it handled the immediate, the
             * saturate flag is no longer needed.
             */
            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].as_brw_reg())) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         /* The source is no longer a uniform, so a plain MOV suffices. */
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         /* a + 0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            /* a * 0 = 0: move a zero immediate of src0's type. */
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = brw_imm_f(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = brw_imm_d(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = brw_imm_ud(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            /* a * 1 = a */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            /* a * -1 = -a: toggle the negate modifier on src0. */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         /* -|x| >= 0 holds only when x == 0, so drop the source modifiers
          * and compare for zero instead.
          */
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         /* RCP of the value just produced by a SQRT becomes a single RSQ
          * of the SQRT's source.
          * NOTE(review): prev is dereferenced without checking that inst
          * has a preceding instruction in its block -- confirm this is
          * safe for the first instruction of a block.
          */
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         /* A uniform source, or a zero src1, reduces the broadcast to a
          * MOV that writes all channels.
          */
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
 703
/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 *
 * Uniform vectors at or beyond the push limit are assigned a slot in
 * stage_prog_data->pull_param (reusing an existing identical entry when
 * one is found), every instruction source that reads such a uniform is
 * redirected to a temporary filled by a pull constant load, and the
 * remaining push constants are repacked.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   /* Per uniform vector: index of its vec4 in the pull constant buffer, or
    * -1 if it stays a push constant.  Only filled in when the push limit
    * is exceeded.
    */
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      /* i counts components; i / 4 is the uniform vector index. */
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            /* Count how many leading components of this vec4 match. */
            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         /* No existing copy: append the vec4 to the pull parameters. */
         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].nr] == -1)
            continue;

         int uniform = inst->src[i].nr;

         /* Load the constant into a fresh vec4 temporary... */
         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         /* ...and make the source read that temporary instead. */
         inst->src[i].file = temp.file;
         inst->src[i].nr = temp.nr;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
 790
 791 /* Conditions for which we want to avoid setting the dependency control bits */
 792 bool
 793 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
 794 {
 795 #define IS_DWORD(reg) \
 796    (reg.type == BRW_REGISTER_TYPE_UD || \
 797     reg.type == BRW_REGISTER_TYPE_D)
 798
 799    /* "When source or destination datatype is 64b or operation is integer DWord
 800     * multiply, DepCtrl must not be used."
 801     * May apply to future SoCs as well.
 802     */
 803    if (devinfo->is_cherryview) {
 804       if (inst->opcode == BRW_OPCODE_MUL &&
 805          IS_DWORD(inst->src[0]) &&
 806          IS_DWORD(inst->src[1]))
 807          return true;
 808    }
 809 #undef IS_DWORD
 810
 811    if (devinfo->gen >= 8) {
 812       if (inst->opcode == BRW_OPCODE_F32TO16)
 813          return true;
 814    }
 815
 816    /*
 817     * mlen:
 818     * In the presence of send messages, totally interrupt dependency
 819     * control. They're long enough that the chance of dependency
 820     * control around them just doesn't matter.
 821     *
 822     * predicate:
 823     * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
 824     * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
 825     * completes the scoreboard clear must have a non-zero execution mask. This
 826     * means, if any kind of predication can change the execution mask or channel
 827     * enable of the last instruction, the optimization must be avoided. This is
 828     * to avoid instructions being shot down the pipeline when no writes are
 829     * required.
 830     *
 831     * math:
 832     * Dependency control does not work well over math instructions.
 833     * NB: Discovered empirically
 834     */
 835    return (inst->mlen || inst->predicate || inst->is_math());
 836 }
 837
/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 * DP4 temp.x vertex uniform[0]
 * DP4 temp.y vertex uniform[0]
 * DP4 temp.z vertex uniform[0]
 * DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
 *
 * Within each basic block the pass remembers the last instruction that
 * wrote each GRF/MRF and which channels have been written since tracking
 * began.  When a later instruction writes disjoint channels of the same
 * register, the earlier one gets no_dd_clear and the later one no_dd_check.
 */
void
vec4_visitor::opt_set_dependency_control()
{
   /* Last writer and accumulated written channels, indexed by register
    * number.  The *_channels_written entries are only read while the
    * matching last_*_write entry is non-NULL.
    */
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      /* Tracking never carries across basic blocks. */
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].nr + inst->src[i].reg_offset;
            if (inst->src[i].file == VGRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == FIXED_GRF) {
               /* A FIXED_GRF read drops tracking for every GRF. */
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         /* Unsafe instructions end all tracked sequences and are never
          * marked themselves.
          */
         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.nr + inst->dst.reg_offset;
         if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               /* Disjoint channels: chain this write to the previous one. */
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               /* No previous writer or overlapping channels: start over. */
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            /* Same bookkeeping as above, applied to MRF destinations. */
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         }
      }
   }
}
 919
 920 bool
 921 vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
 922                                 int dst_writemask,
 923                                 int swizzle,
 924                                 int swizzle_mask)
 925 {
 926    /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
 927     * or writemasking are not allowed.
 928     */
 929    if (devinfo->gen == 6 && is_math() &&
 930        (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
 931       return false;
 932
 933    /* If this instruction sets anything not referenced by swizzle, then we'd
 934     * totally break it when we reswizzle.
 935     */
 936    if (dst.writemask & ~swizzle_mask)
 937       return false;
 938
 939    if (mlen > 0)
 940       return false;
 941
 942    for (int i = 0; i < 3; i++) {
 943       if (src[i].is_accumulator())
 944          return false;
 945    }
 946
 947    return true;
 948 }
 949
 950 /**
 951  * For any channels in the swizzle's source that were populated by this
 952  * instruction, rewrite the instruction to put the appropriate result directly
 953  * in those channels.
 954  *
 955  * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
 956  */
 957 void
 958 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
 959 {
 960    /* Destination write mask doesn't correspond to source swizzle for the dot
 961     * product and pack_bytes instructions.
 962     */
 963    if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
 964        opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
 965        opcode != VEC4_OPCODE_PACK_BYTES) {
 966       for (int i = 0; i < 3; i++) {
 967          if (src[i].file == BAD_FILE || src[i].file == IMM)
 968             continue;
 969
 970          src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
 971       }
 972    }
 973
 974    /* Apply the specified swizzle and writemask to the original mask of
 975     * written components.
 976     */
 977    dst.writemask = dst_writemask &
 978                    brw_apply_swizzle_to_mask(swizzle, dst.writemask);
 979 }
 980
/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 *
 * For each candidate MOV the pass scans backwards within the MOV's block,
 * looking for the instructions that wrote the MOV's source, and gives up on
 * that MOV as soon as anything in between would make the rewrite unsafe.
 * When every needed source channel has been accounted for, the writers are
 * retargeted at the MOV's destination and the MOV is removed.
 *
 * Returns true if at least one MOV was removed; live intervals are
 * invalidated in that case.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   /* Running index of the instruction being visited, counted in iteration
    * order; compared against the live range end of the MOV source below.
    */
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only unpredicated, non type-converting MOVs from a VGRF without
       * source modifiers or relative addressing, into a VGRF or MRF, are
       * candidates.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != VGRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      /* Remove no-op MOVs */
      if (inst->dst.file == inst->src[0].file &&
          inst->dst.nr == inst->src[0].nr &&
          inst->dst.reg_offset == inst->src[0].reg_offset) {
         bool is_nop_mov = true;

         /* The MOV is a no-op only if every written channel reads the same
          * channel of the same register.
          */
         for (unsigned c = 0; c < 4; c++) {
            if ((inst->dst.writemask & (1 << c)) == 0)
               continue;

            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
               is_nop_mov = false;
               break;
            }
         }

         if (is_nop_mov) {
            inst->remove(block);
            continue;
         }
      }

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      /* _scan_inst remembers the last instruction the reverse walk visited,
       * so the rewrite loop further down knows where to start from.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * VGRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* This doesn't handle saturation on the instruction we
             * want to coalesce away if the register types do not match.
             * But if scan_inst is a non type-converting 'mov', we can fix
             * the types later.
             */
            if (inst->saturate &&
                inst->dst.type != scan_inst->dst.type &&
                !(scan_inst->opcode == BRW_OPCODE_MOV &&
                  scan_inst->dst.type == scan_inst->src[0].type))
               break;

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            /* Every needed channel has a writer: the scan is complete. */
            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes the same channels of our destination here,
          * we can't coalesce before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            /* A message reads the MRF range [base_mrf, base_mrf + mlen). */
            if (inst->dst.nr >= scan_inst->base_mrf &&
                inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      /* chans_remaining is only zero if the scan above found unconditional
       * writers for every needed channel before hitting any bail-out.
       */
      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            /* Only instructions writing the temporary are retargeted. */
            if (scan_inst->dst.file == VGRF &&
                scan_inst->dst.nr == inst->src[0].nr &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.nr = inst->dst.nr;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               if (inst->saturate &&
                   inst->dst.type != scan_inst->dst.type) {
                  /* If we have reached this point, scan_inst is a non
                   * type-converting 'mov' and we can modify its register types
                   * to match the ones in inst. Otherwise, we could have an
                   * incorrect saturation result.
                   */
                  scan_inst->dst.type = inst->dst.type;
                  scan_inst->src[0].type = inst->src[0].type;
               }
               /* The writer inherits the saturate flag of the removed MOV. */
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
1184
1185 /**
1186  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1187  * flow.  We could probably do better here with some form of divergence
1188  * analysis.
1189  */
1190 bool
1191 vec4_visitor::eliminate_find_live_channel()
1192 {
1193    bool progress = false;
1194    unsigned depth = 0;
1195
1196    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1197       switch (inst->opcode) {
1198       case BRW_OPCODE_IF:
1199       case BRW_OPCODE_DO:
1200          depth++;
1201          break;
1202
1203       case BRW_OPCODE_ENDIF:
1204       case BRW_OPCODE_WHILE:
1205          depth--;
1206          break;
1207
1208       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
1209          if (depth == 0) {
1210             inst->opcode = BRW_OPCODE_MOV;
1211             inst->src[0] = brw_imm_d(0);
1212             inst->force_writemask_all = true;
1213             progress = true;
1214          }
1215          break;
1216
1217       default:
1218          break;
1219       }
1220    }
1221
1222    return progress;
1223 }
1224
1225 /**
1226  * Splits virtual GRFs requesting more than one contiguous physical register.
1227  *
1228  * We initially create large virtual GRFs for temporary structures, arrays,
1229  * and matrices, so that the dereference visitor functions can add reg_offsets
1230  * to work their way down to the actual member being accessed.  But when it
1231  * comes to optimization, we'd like to treat each register as individual
1232  * storage if possible.
1233  *
1234  * So far, the only thing that might prevent splitting is a send message from
1235  * a GRF on IVB.
1236  */
1237 void
1238 vec4_visitor::split_virtual_grfs()
1239 {
1240    int num_vars = this->alloc.count;
1241    int new_virtual_grf[num_vars];
1242    bool split_grf[num_vars];
1243
1244    memset(new_virtual_grf, 0, sizeof(new_virtual_grf));
1245
1246    /* Try to split anything > 0 sized. */
1247    for (int i = 0; i < num_vars; i++) {
1248       split_grf[i] = this->alloc.sizes[i] != 1;
1249    }
1250
1251    /* Check that the instructions are compatible with the registers we're trying
1252     * to split.
1253     */
1254    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1255       if (inst->dst.file == VGRF && inst->regs_written > 1)
1256          split_grf[inst->dst.nr] = false;
1257
1258       for (int i = 0; i < 3; i++) {
1259          if (inst->src[i].file == VGRF && inst->regs_read(i) > 1)
1260             split_grf[inst->src[i].nr] = false;
1261       }
1262    }
1263
1264    /* Allocate new space for split regs.  Note that the virtual
1265     * numbers will be contiguous.
1266     */
1267    for (int i = 0; i < num_vars; i++) {
1268       if (!split_grf[i])
1269          continue;
1270
1271       new_virtual_grf[i] = alloc.allocate(1);
1272       for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1273          unsigned reg = alloc.allocate(1);
1274          assert(reg == new_virtual_grf[i] + j - 1);
1275          (void) reg;
1276       }
1277       this->alloc.sizes[i] = 1;
1278    }
1279
1280    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1281       if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1282           inst->dst.reg_offset != 0) {
1283          inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1284                           inst->dst.reg_offset - 1);
1285          inst->dst.reg_offset = 0;
1286       }
1287       for (int i = 0; i < 3; i++) {
1288          if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1289              inst->src[i].reg_offset != 0) {
1290             inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1291                                 inst->src[i].reg_offset - 1);
1292             inst->src[i].reg_offset = 0;
1293          }
1294       }
1295    }
1296    invalidate_live_intervals();
1297 }
1298
1299 void
1300 vec4_visitor::dump_instruction(backend_instruction *be_inst)
1301 {
1302    dump_instruction(be_inst, stderr);
1303 }
1304
1305 void
1306 vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
1307 {
1308    vec4_instruction *inst = (vec4_instruction *)be_inst;
1309
1310    if (inst->predicate) {
1311       fprintf(file, "(%cf0.%d%s) ",
1312               inst->predicate_inverse ? '-' : '+',
1313               inst->flag_subreg,
1314               pred_ctrl_align16[inst->predicate]);
1315    }
1316
1317    fprintf(file, "%s", brw_instruction_name(inst->opcode));
1318    if (inst->saturate)
1319       fprintf(file, ".sat");
1320    if (inst->conditional_mod) {
1321       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
1322       if (!inst->predicate &&
1323           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
1324                                 inst->opcode != BRW_OPCODE_IF &&
1325                                 inst->opcode != BRW_OPCODE_WHILE))) {
1326          fprintf(file, ".f0.%d", inst->flag_subreg);
1327       }
1328    }
1329    fprintf(file, " ");
1330
1331    switch (inst->dst.file) {
1332    case VGRF:
1333       fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset);
1334       break;
1335    case FIXED_GRF:
1336       fprintf(file, "g%d", inst->dst.nr);
1337       break;
1338    case MRF:
1339       fprintf(file, "m%d", inst->dst.nr);
1340       break;
1341    case ARF:
1342       switch (inst->dst.nr) {
1343       case BRW_ARF_NULL:
1344          fprintf(file, "null");
1345          break;
1346       case BRW_ARF_ADDRESS:
1347          fprintf(file, "a0.%d", inst->dst.subnr);
1348          break;
1349       case BRW_ARF_ACCUMULATOR:
1350          fprintf(file, "acc%d", inst->dst.subnr);
1351          break;
1352       case BRW_ARF_FLAG:
1353          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1354          break;
1355       default:
1356          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1357          break;
1358       }
1359       if (inst->dst.subnr)
1360          fprintf(file, "+%d", inst->dst.subnr);
1361       break;
1362    case BAD_FILE:
1363       fprintf(file, "(null)");
1364       break;
1365    case IMM:
1366    case ATTR:
1367    case UNIFORM:
1368       unreachable("not reached");
1369    }
1370    if (inst->dst.writemask != WRITEMASK_XYZW) {
1371       fprintf(file, ".");
1372       if (inst->dst.writemask & 1)
1373          fprintf(file, "x");
1374       if (inst->dst.writemask & 2)
1375          fprintf(file, "y");
1376       if (inst->dst.writemask & 4)
1377          fprintf(file, "z");
1378       if (inst->dst.writemask & 8)
1379          fprintf(file, "w");
1380    }
1381    fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
1382
1383    if (inst->src[0].file != BAD_FILE)
1384       fprintf(file, ", ");
1385
1386    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1387       if (inst->src[i].negate)
1388          fprintf(file, "-");
1389       if (inst->src[i].abs)
1390          fprintf(file, "|");
1391       switch (inst->src[i].file) {
1392       case VGRF:
1393          fprintf(file, "vgrf%d", inst->src[i].nr);
1394          break;
1395       case FIXED_GRF:
1396          fprintf(file, "g%d", inst->src[i].nr);
1397          break;
1398       case ATTR:
1399          fprintf(file, "attr%d", inst->src[i].nr);
1400          break;
1401       case UNIFORM:
1402          fprintf(file, "u%d", inst->src[i].nr);
1403          break;
1404       case IMM:
1405          switch (inst->src[i].type) {
1406          case BRW_REGISTER_TYPE_F:
1407             fprintf(file, "%fF", inst->src[i].f);
1408             break;
1409          case BRW_REGISTER_TYPE_D:
1410             fprintf(file, "%dD", inst->src[i].d);
1411             break;
1412          case BRW_REGISTER_TYPE_UD:
1413             fprintf(file, "%uU", inst->src[i].ud);
1414             break;
1415          case BRW_REGISTER_TYPE_VF:
1416             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1417                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
1418                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
1419                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1420                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1421             break;
1422          default:
1423             fprintf(file, "???");
1424             break;
1425          }
1426          break;
1427       case ARF:
1428          switch (inst->src[i].nr) {
1429          case BRW_ARF_NULL:
1430             fprintf(file, "null");
1431             break;
1432          case BRW_ARF_ADDRESS:
1433             fprintf(file, "a0.%d", inst->src[i].subnr);
1434             break;
1435          case BRW_ARF_ACCUMULATOR:
1436             fprintf(file, "acc%d", inst->src[i].subnr);
1437             break;
1438          case BRW_ARF_FLAG:
1439             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1440             break;
1441          default:
1442             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1443             break;
1444          }
1445          if (inst->src[i].subnr)
1446             fprintf(file, "+%d", inst->src[i].subnr);
1447          break;
1448       case BAD_FILE:
1449          fprintf(file, "(null)");
1450          break;
1451       case MRF:
1452          unreachable("not reached");
1453       }
1454
1455       /* Don't print .0; and only VGRFs have reg_offsets and sizes */
1456       if (inst->src[i].reg_offset != 0 &&
1457           inst->src[i].file == VGRF &&
1458           alloc.sizes[inst->src[i].nr] != 1)
1459          fprintf(file, ".%d", inst->src[i].reg_offset);
1460
1461       if (inst->src[i].file != IMM) {
1462          static const char *chans[4] = {"x", "y", "z", "w"};
1463          fprintf(file, ".");
1464          for (int c = 0; c < 4; c++) {
1465             fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
1466          }
1467       }
1468
1469       if (inst->src[i].abs)
1470          fprintf(file, "|");
1471
1472       if (inst->src[i].file != IMM) {
1473          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
1474       }
1475
1476       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1477          fprintf(file, ", ");
1478    }
1479
1480    if (inst->force_writemask_all)
1481       fprintf(file, " NoMask");
1482
1483    fprintf(file, "\n");
1484 }
1485
1486
1487 static inline struct brw_reg
1488 attribute_to_hw_reg(int attr, bool interleaved)
1489 {
1490    if (interleaved)
1491       return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
1492    else
1493       return brw_vec8_grf(attr, 0);
1494 }
1495
1496
1497 /**
1498  * Replace each register of type ATTR in this->instructions with a reference
1499  * to a fixed HW register.
1500  *
1501  * If interleaved is true, then each attribute takes up half a register, with
1502  * register N containing attribute 2*N in its first half and attribute 2*N+1
1503  * in its second half (this corresponds to the payload setup used by geometry
1504  * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
1505  * false, then each attribute takes up a whole register, with register N
1506  * containing attribute N (this corresponds to the payload setup used by
1507  * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
1508  */
1509 void
1510 vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
1511                                           bool interleaved)
1512 {
1513    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1514       /* We have to support ATTR as a destination for GL_FIXED fixup. */
1515       if (inst->dst.file == ATTR) {
1516          int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset];
1517
1518          /* All attributes used in the shader need to have been assigned a
1519           * hardware register by the caller
1520           */
1521          assert(grf != 0);
1522
1523          struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1524          reg.type = inst->dst.type;
1525          reg.writemask = inst->dst.writemask;
1526
1527          inst->dst = reg;
1528       }
1529
1530       for (int i = 0; i < 3; i++) {
1531          if (inst->src[i].file != ATTR)
1532             continue;
1533
1534          int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset];
1535
1536          /* All attributes used in the shader need to have been assigned a
1537           * hardware register by the caller
1538           */
1539          assert(grf != 0);
1540
1541          struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
1542          reg.swizzle = inst->src[i].swizzle;
1543          reg.type = inst->src[i].type;
1544          if (inst->src[i].abs)
1545             reg = brw_abs(reg);
1546          if (inst->src[i].negate)
1547             reg = negate(reg);
1548
1549          inst->src[i] = reg;
1550       }
1551    }
1552 }
1553
1554 int
1555 vec4_vs_visitor::setup_attributes(int payload_reg)
1556 {
1557    int nr_attributes;
1558    int attribute_map[VERT_ATTRIB_MAX + 1];
1559    memset(attribute_map, 0, sizeof(attribute_map));
1560
1561    nr_attributes = 0;
1562    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
1563       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
1564          attribute_map[i] = payload_reg + nr_attributes;
1565          nr_attributes++;
1566       }
1567    }
1568
1569    /* VertexID is stored by the VF as the last vertex element, but we
1570     * don't represent it with a flag in inputs_read, so we call it
1571     * VERT_ATTRIB_MAX.
1572     */
1573    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
1574       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
1575    }
1576
1577    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
1578
1579    return payload_reg + vs_prog_data->nr_attributes;
1580 }
1581
1582 int
1583 vec4_visitor::setup_uniforms(int reg)
1584 {
1585    prog_data->base.dispatch_grf_start_reg = reg;
1586
1587    /* The pre-gen6 VS requires that some push constants get loaded no
1588     * matter what, or the GPU would hang.
1589     */
1590    if (devinfo->gen < 6 && this->uniforms == 0) {
1591       assert(this->uniforms < this->uniform_array_size);
1592
1593       stage_prog_data->param =
1594          reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
1595       for (unsigned int i = 0; i < 4; i++) {
1596          unsigned int slot = this->uniforms * 4 + i;
1597          static gl_constant_value zero = { 0.0 };
1598          stage_prog_data->param[slot] = &zero;
1599       }
1600
1601       this->uniforms++;
1602       reg++;
1603    } else {
1604       reg += ALIGN(uniforms, 2) / 2;
1605    }
1606
1607    stage_prog_data->nr_params = this->uniforms * 4;
1608
1609    prog_data->base.curb_read_length =
1610       reg - prog_data->base.dispatch_grf_start_reg;
1611
1612    return reg;
1613 }
1614
1615 void
1616 vec4_vs_visitor::setup_payload(void)
1617 {
1618    int reg = 0;
1619
1620    /* The payload always contains important data in g0, which contains
1621     * the URB handles that are passed on to the URB write at the end
1622     * of the thread.  So, we always start push constants at g1.
1623     */
1624    reg++;
1625
1626    reg = setup_uniforms(reg);
1627
1628    reg = setup_attributes(reg);
1629
1630    this->first_non_payload_grf = reg;
1631 }
1632
1633 src_reg
1634 vec4_visitor::get_timestamp()
1635 {
1636    assert(devinfo->gen >= 7);
1637
1638    src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1639                                 BRW_ARF_TIMESTAMP,
1640                                 0,
1641                                 0,
1642                                 0,
1643                                 BRW_REGISTER_TYPE_UD,
1644                                 BRW_VERTICAL_STRIDE_0,
1645                                 BRW_WIDTH_4,
1646                                 BRW_HORIZONTAL_STRIDE_4,
1647                                 BRW_SWIZZLE_XYZW,
1648                                 WRITEMASK_XYZW));
1649
1650    dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
1651
1652    vec4_instruction *mov = emit(MOV(dst, ts));
1653    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1654     * even if it's not enabled in the dispatch.
1655     */
1656    mov->force_writemask_all = true;
1657
1658    return src_reg(dst);
1659 }
1660
1661 void
1662 vec4_visitor::emit_shader_time_begin()
1663 {
1664    current_annotation = "shader time start";
1665    shader_start_time = get_timestamp();
1666 }
1667
1668 void
1669 vec4_visitor::emit_shader_time_end()
1670 {
1671    current_annotation = "shader time end";
1672    src_reg shader_end_time = get_timestamp();
1673
1674
1675    /* Check that there weren't any timestamp reset events (assuming these
1676     * were the only two timestamp reads that happened).
1677     */
1678    src_reg reset_end = shader_end_time;
1679    reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
1680    vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
1681    test->conditional_mod = BRW_CONDITIONAL_Z;
1682
1683    emit(IF(BRW_PREDICATE_NORMAL));
1684
1685    /* Take the current timestamp and get the delta. */
1686    shader_start_time.negate = true;
1687    dst_reg diff = dst_reg(this, glsl_type::uint_type);
1688    emit(ADD(diff, shader_start_time, shader_end_time));
1689
1690    /* If there were no instructions between the two timestamp gets, the diff
1691     * is 2 cycles.  Remove that overhead, so I can forget about that when
1692     * trying to determine the time taken for single instructions.
1693     */
1694    emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));
1695
1696    emit_shader_time_write(0, src_reg(diff));
1697    emit_shader_time_write(1, brw_imm_ud(1u));
1698    emit(BRW_OPCODE_ELSE);
1699    emit_shader_time_write(2, brw_imm_ud(1u));
1700    emit(BRW_OPCODE_ENDIF);
1701 }
1702
1703 void
1704 vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
1705 {
1706    dst_reg dst =
1707       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
1708
1709    dst_reg offset = dst;
1710    dst_reg time = dst;
1711    time.reg_offset++;
1712
1713    offset.type = BRW_REGISTER_TYPE_UD;
1714    int index = shader_time_index * 3 + shader_time_subindex;
1715    emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));
1716
1717    time.type = BRW_REGISTER_TYPE_UD;
1718    emit(MOV(time, value));
1719
1720    vec4_instruction *inst =
1721       emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
1722    inst->mlen = 2;
1723 }
1724
1725 void
1726 vec4_visitor::convert_to_hw_regs()
1727 {
1728    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1729       for (int i = 0; i < 3; i++) {
1730          struct src_reg &src = inst->src[i];
1731          struct brw_reg reg;
1732          switch (src.file) {
1733          case VGRF:
1734             reg = brw_vec8_grf(src.nr + src.reg_offset, 0);
1735             reg.type = src.type;
1736             reg.swizzle = src.swizzle;
1737             reg.abs = src.abs;
1738             reg.negate = src.negate;
1739             break;
1740
1741          case UNIFORM:
1742             reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
1743                                       (src.nr + src.reg_offset) / 2,
1744                                       ((src.nr + src.reg_offset) % 2) * 4),
1745                          0, 4, 1);
1746             reg.type = src.type;
1747             reg.swizzle = src.swizzle;
1748             reg.abs = src.abs;
1749             reg.negate = src.negate;
1750
1751             /* This should have been moved to pull constants. */
1752             assert(!src.reladdr);
1753             break;
1754
1755          case ARF:
1756          case FIXED_GRF:
1757          case IMM:
1758             continue;
1759
1760          case BAD_FILE:
1761             /* Probably unused. */
1762             reg = brw_null_reg();
1763             break;
1764
1765          case MRF:
1766          case ATTR:
1767             unreachable("not reached");
1768          }
1769          src = reg;
1770       }
1771
1772       dst_reg &dst = inst->dst;
1773       struct brw_reg reg;
1774
1775       switch (inst->dst.file) {
1776       case VGRF:
1777          reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0);
1778          reg.type = dst.type;
1779          reg.writemask = dst.writemask;
1780          break;
1781
1782       case MRF:
1783          assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
1784          reg = brw_message_reg(dst.nr + dst.reg_offset);
1785          reg.type = dst.type;
1786          reg.writemask = dst.writemask;
1787          break;
1788
1789       case ARF:
1790       case FIXED_GRF:
1791          reg = dst.as_brw_reg();
1792          break;
1793
1794       case BAD_FILE:
1795          reg = brw_null_reg();
1796          break;
1797
1798       case IMM:
1799       case ATTR:
1800       case UNIFORM:
1801          unreachable("not reached");
1802       }
1803
1804       dst = reg;
1805    }
1806 }
1807
1808 bool
1809 vec4_visitor::run()
1810 {
1811    if (shader_time_index >= 0)
1812       emit_shader_time_begin();
1813
1814    emit_prolog();
1815
1816    emit_nir_code();
1817    if (failed)
1818       return false;
1819    base_ir = NULL;
1820
1821    emit_thread_end();
1822
1823    calculate_cfg();
1824
1825    /* Before any optimization, push array accesses out to scratch
1826     * space where we need them to be.  This pass may allocate new
1827     * virtual GRFs, so we want to do it early.  It also makes sure
1828     * that we have reladdr computations available for CSE, since we'll
1829     * often do repeated subexpressions for those.
1830     */
1831    move_grf_array_access_to_scratch();
1832    move_uniform_array_access_to_pull_constants();
1833
1834    pack_uniform_registers();
1835    move_push_constants_to_pull_constants();
1836    split_virtual_grfs();
1837
1838 #define OPT(pass, args...) ({                                          \
1839       pass_num++;                                                      \
1840       bool this_progress = pass(args);                                 \
1841                                                                        \
1842       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
1843          char filename[64];                                            \
1844          snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
1845                   stage_abbrev, nir->info.name, iteration, pass_num);  \
1846                                                                        \
1847          backend_shader::dump_instructions(filename);                  \
1848       }                                                                \
1849                                                                        \
1850       progress = progress || this_progress;                            \
1851       this_progress;                                                   \
1852    })
1853
1854
1855    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
1856       char filename[64];
1857       snprintf(filename, 64, "%s-%s-00-start",
1858                stage_abbrev, nir->info.name);
1859
1860       backend_shader::dump_instructions(filename);
1861    }
1862
1863    bool progress;
1864    int iteration = 0;
1865    int pass_num = 0;
1866    do {
1867       progress = false;
1868       pass_num = 0;
1869       iteration++;
1870
1871       OPT(opt_predicated_break, this);
1872       OPT(opt_reduce_swizzle);
1873       OPT(dead_code_eliminate);
1874       OPT(dead_control_flow_eliminate, this);
1875       OPT(opt_copy_propagation);
1876       OPT(opt_cmod_propagation);
1877       OPT(opt_cse);
1878       OPT(opt_algebraic);
1879       OPT(opt_register_coalesce);
1880       OPT(eliminate_find_live_channel);
1881    } while (progress);
1882
1883    pass_num = 0;
1884
1885    if (OPT(opt_vector_float)) {
1886       OPT(opt_cse);
1887       OPT(opt_copy_propagation, false);
1888       OPT(opt_copy_propagation, true);
1889       OPT(dead_code_eliminate);
1890    }
1891
1892    if (failed)
1893       return false;
1894
1895    setup_payload();
1896
1897    if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
1898       /* Debug of register spilling: Go spill everything. */
1899       const int grf_count = alloc.count;
1900       float spill_costs[alloc.count];
1901       bool no_spill[alloc.count];
1902       evaluate_spill_costs(spill_costs, no_spill);
1903       for (int i = 0; i < grf_count; i++) {
1904          if (no_spill[i])
1905             continue;
1906          spill_reg(i);
1907       }
1908    }
1909
1910    bool allocated_without_spills = reg_allocate();
1911
1912    if (!allocated_without_spills) {
1913       compiler->shader_perf_log(log_data,
1914                                 "%s shader triggered register spilling.  "
1915                                 "Try reducing the number of live vec4 values "
1916                                 "to improve performance.\n",
1917                                 stage_name);
1918
1919       while (!reg_allocate()) {
1920          if (failed)
1921             return false;
1922       }
1923    }
1924
1925    opt_schedule_instructions();
1926
1927    opt_set_dependency_control();
1928
1929    convert_to_hw_regs();
1930
1931    if (last_scratch > 0) {
1932       prog_data->base.total_scratch =
1933          brw_get_scratch_size(last_scratch * REG_SIZE);
1934    }
1935
1936    return !failed;
1937 }
1938
1939 } /* namespace brw */
1940
1941 extern "C" {
1942
1943 /**
1944  * Compile a vertex shader.
1945  *
1946  * Returns the final assembly and the program's size.
1947  */
1948 const unsigned *
1949 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
1950                void *mem_ctx,
1951                const struct brw_vs_prog_key *key,
1952                struct brw_vs_prog_data *prog_data,
1953                const nir_shader *src_shader,
1954                gl_clip_plane *clip_planes,
1955                bool use_legacy_snorm_formula,
1956                int shader_time_index,
1957                unsigned *final_assembly_size,
1958                char **error_str)
1959 {
1960    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
1961    shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
1962                                       compiler->scalar_stage[MESA_SHADER_VERTEX]);
1963    shader = brw_postprocess_nir(shader, compiler->devinfo,
1964                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);
1965
1966    const unsigned *assembly = NULL;
1967
1968    unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
1969
1970    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
1971     * incoming vertex attribute.  So, add an extra slot.
1972     */
1973    if (shader->info.system_values_read &
1974        (BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
1975         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
1976       nr_attributes++;
1977    }
1978
1979    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
1980     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
1981     * vec4 mode, the hardware appears to wedge unless we read something.
1982     */
1983    if (compiler->scalar_stage[MESA_SHADER_VERTEX])
1984       prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
1985    else
1986       prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);
1987
1988    prog_data->nr_attributes = nr_attributes;
1989
1990    /* Since vertex shaders reuse the same VUE entry for inputs and outputs
1991     * (overwriting the original contents), we need to make sure the size is
1992     * the larger of the two.
1993     */
1994    const unsigned vue_entries =
1995       MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);
1996
1997    if (compiler->devinfo->gen == 6)
1998       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
1999    else
2000       prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2001
2002    if (compiler->scalar_stage[MESA_SHADER_VERTEX]) {
2003       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
2004
2005       fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
2006                    NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
2007                    shader, 8, shader_time_index);
2008       if (!v.run_vs(clip_planes)) {
2009          if (error_str)
2010             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2011
2012          return NULL;
2013       }
2014
2015       fs_generator g(compiler, log_data, mem_ctx, (void *) key,
2016                      &prog_data->base.base, v.promoted_constants,
2017                      v.runtime_check_aads_emit, "VS");
2018       if (INTEL_DEBUG & DEBUG_VS) {
2019          const char *debug_name =
2020             ralloc_asprintf(mem_ctx, "%s vertex shader %s",
2021                             shader->info.label ? shader->info.label : "unnamed",
2022                             shader->info.name);
2023
2024          g.enable_debug(debug_name);
2025       }
2026       g.generate_code(v.cfg, 8);
2027       assembly = g.get_assembly(final_assembly_size);
2028    }
2029
2030    if (!assembly) {
2031       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
2032
2033       vec4_vs_visitor v(compiler, log_data, key, prog_data,
2034                         shader, clip_planes, mem_ctx,
2035                         shader_time_index, use_legacy_snorm_formula);
2036       if (!v.run()) {
2037          if (error_str)
2038             *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
2039
2040          return NULL;
2041       }
2042
2043       assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
2044                                             shader, &prog_data->base, v.cfg,
2045                                             final_assembly_size);
2046    }
2047
2048    return assembly;
2049 }
2050
2051 } /* extern "C" */