i965/vs: Convert gen6 userclip handling to new generators.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 }
29
30 namespace brw {
31
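/**
 * Convert a dst_reg to a src_reg: copy the register identity and build a
 * read swizzle from the write mask by packing the enabled channels together
 * and replicating the last enabled channel into any remaining slots.
 */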
32 src_reg::src_reg(dst_reg reg)
33 {
34 init();
35
36 this->file = reg.file;
37 this->reg = reg.reg;
38 this->reg_offset = reg.reg_offset;
39 this->type = reg.type;
40 this->reladdr = reg.reladdr;
41 this->fixed_hw_reg = reg.fixed_hw_reg;
42
43 int swizzles[4];
44 int next_chan = 0;
45 int last = 0;
46
47 for (int i = 0; i < 4; i++) {
48 if (!(reg.writemask & (1 << i)))
49 continue;
50
51 swizzles[next_chan++] = last = i;
52 }
53
54 for (; next_chan < 4; next_chan++) {
55 swizzles[next_chan] = last;
56 }
57
58 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59 swizzles[2], swizzles[3]);
60 }
61
62 dst_reg::dst_reg(src_reg reg)
63 {
64 init();
65
66 this->file = reg.file;
67 this->reg = reg.reg;
68 this->reg_offset = reg.reg_offset;
69 this->type = reg.type;
70 this->writemask = WRITEMASK_XYZW;
71 this->reladdr = reg.reladdr;
72 this->fixed_hw_reg = reg.fixed_hw_reg;
73 }
74
75 vec4_instruction::vec4_instruction(vec4_visitor *v,
76 enum opcode opcode, dst_reg dst,
77 src_reg src0, src_reg src1, src_reg src2)
78 {
79 this->opcode = opcode;
80 this->dst = dst;
81 this->src[0] = src0;
82 this->src[1] = src1;
83 this->src[2] = src2;
84 this->ir = v->base_ir;
85 this->annotation = v->current_annotation;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(vec4_instruction *inst)
90 {
91 this->instructions.push_tail(inst);
92
93 return inst;
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
98 src_reg src0, src_reg src1, src_reg src2)
99 {
100 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
101 src0, src1, src2));
102 }
103
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
113 {
114 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
115 }
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
121 }
122
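/* Convenience builders for simple ALU instructions.  Note that these only
 * allocate the vec4_instruction; the caller still has to pass the result
 * through emit() to add it to the instruction stream.
 */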
123 #define ALU1(op) \
124 vec4_instruction * \
125 vec4_visitor::op(dst_reg dst, src_reg src0) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0); \
129 }
130
131 #define ALU2(op) \
132 vec4_instruction * \
133 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
134 { \
135 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
136 src0, src1); \
137 }
138
139 ALU1(NOT)
140 ALU1(MOV)
141 ALU1(FRC)
142 ALU1(RNDD)
143 ALU1(RNDE)
144 ALU1(RNDZ)
145 ALU2(ADD)
146 ALU2(MUL)
147 ALU2(MACH)
148 ALU2(AND)
149 ALU2(OR)
150 ALU2(XOR)
151 ALU2(DP3)
152 ALU2(DP4)
153
154 /** Gen4 predicated IF. */
155 vec4_instruction *
156 vec4_visitor::IF(uint32_t predicate)
157 {
158 vec4_instruction *inst;
159
160 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
161 inst->predicate = predicate;
162
163 return inst;
164 }
165
166 /** Gen6+ IF with embedded comparison. */
167 vec4_instruction *
168 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
169 {
170 assert(intel->gen >= 6);
171
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
175 src0, src1);
176 inst->conditional_mod = condition;
177
178 return inst;
179 }
180
181 vec4_instruction *
182 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst,
187 src0, src1, src_reg());
188 inst->conditional_mod = condition;
189
190 return inst;
191 }
192
193 void
194 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
195 {
196 static enum opcode dot_opcodes[] = {
197 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
198 };
199
200 emit(dot_opcodes[elements - 2], dst, src0, src1);
201 }
202
203 void
204 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
205 {
206 /* The gen6 math instruction ignores the source modifiers --
207 * swizzle, abs, negate, and at least some parts of the register
208 * region description.
209 *
210 * While it would seem that this MOV could be avoided at this point
211 * in the case that the swizzle is matched up with the destination
212 * writemask, note that uniform packing and register allocation
213 * could rearrange our swizzle, so let's leave this matter up to
214 * copy propagation later.
215 */
216 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
217 emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
218
219 if (dst.writemask != WRITEMASK_XYZW) {
220 /* The gen6 math instruction must be align1, so we can't do
221 * writemasks.
222 */
223 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
224
225 emit(opcode, temp_dst, temp_src);
226
227 emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
228 } else {
229 emit(opcode, dst, temp_src);
230 }
231 }
232
233 void
234 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
235 {
236 vec4_instruction *inst = emit(opcode, dst, src);
237 inst->base_mrf = 1;
238 inst->mlen = 1;
239 }
240
241 void
242 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
243 {
244 switch (opcode) {
245 case SHADER_OPCODE_RCP:
246 case SHADER_OPCODE_RSQ:
247 case SHADER_OPCODE_SQRT:
248 case SHADER_OPCODE_EXP2:
249 case SHADER_OPCODE_LOG2:
250 case SHADER_OPCODE_SIN:
251 case SHADER_OPCODE_COS:
252 break;
253 default:
254 assert(!"not reached: bad math opcode");
255 return;
256 }
257
258 if (intel->gen >= 6) {
259 return emit_math1_gen6(opcode, dst, src);
260 } else {
261 return emit_math1_gen4(opcode, dst, src);
262 }
263 }
264
265 void
266 vec4_visitor::emit_math2_gen6(enum opcode opcode,
267 dst_reg dst, src_reg src0, src_reg src1)
268 {
269 src_reg expanded;
270
271 /* The gen6 math instruction ignores the source modifiers --
272 * swizzle, abs, negate, and at least some parts of the register
273 * region description. Move the sources to temporaries to make it
274 * generally work.
275 */
276
277 expanded = src_reg(this, glsl_type::vec4_type);
278 emit(BRW_OPCODE_MOV, dst_reg(expanded), src0);
279 src0 = expanded;
280
281 expanded = src_reg(this, glsl_type::vec4_type);
282 emit(BRW_OPCODE_MOV, dst_reg(expanded), src1);
283 src1 = expanded;
284
285 if (dst.writemask != WRITEMASK_XYZW) {
286 /* The gen6 math instruction must be align1, so we can't do
287 * writemasks.
288 */
289 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
290
291 emit(opcode, temp_dst, src0, src1);
292
293 emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
294 } else {
295 emit(opcode, dst, src0, src1);
296 }
297 }
298
299 void
300 vec4_visitor::emit_math2_gen4(enum opcode opcode,
301 dst_reg dst, src_reg src0, src_reg src1)
302 {
303 vec4_instruction *inst = emit(opcode, dst, src0, src1);
304 inst->base_mrf = 1;
305 inst->mlen = 2;
306 }
307
308 void
309 vec4_visitor::emit_math(enum opcode opcode,
310 dst_reg dst, src_reg src0, src_reg src1)
311 {
312 assert(opcode == SHADER_OPCODE_POW);
313
314 if (intel->gen >= 6) {
315 return emit_math2_gen6(opcode, dst, src0, src1);
316 } else {
317 return emit_math2_gen4(opcode, dst, src0, src1);
318 }
319 }
320
321 void
322 vec4_visitor::visit_instructions(const exec_list *list)
323 {
324 foreach_list(node, list) {
325 ir_instruction *ir = (ir_instruction *)node;
326
327 base_ir = ir;
328 ir->accept(this);
329 }
330 }
331
332
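/**
 * Returns the size of a GLSL type in vec4 register slots.  Scalars and
 * vectors each take a full vec4; matrices take one slot per column; arrays
 * and structs are the sum of their elements, and samplers count as one slot.
 */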
333 static int
334 type_size(const struct glsl_type *type)
335 {
336 unsigned int i;
337 int size;
338
339 switch (type->base_type) {
340 case GLSL_TYPE_UINT:
341 case GLSL_TYPE_INT:
342 case GLSL_TYPE_FLOAT:
343 case GLSL_TYPE_BOOL:
344 if (type->is_matrix()) {
345 return type->matrix_columns;
346 } else {
347 /* Regardless of size of vector, it gets a vec4. This is bad
348 * packing for things like floats, but otherwise arrays become a
349 * mess. Hopefully a later pass over the code can pack scalars
350 * down if appropriate.
351 */
352 return 1;
353 }
354 case GLSL_TYPE_ARRAY:
355 assert(type->length > 0);
356 return type_size(type->fields.array) * type->length;
357 case GLSL_TYPE_STRUCT:
358 size = 0;
359 for (i = 0; i < type->length; i++) {
360 size += type_size(type->fields.structure[i].type);
361 }
362 return size;
363 case GLSL_TYPE_SAMPLER:
364 /* Samplers take up one slot in UNIFORMS[], but they're baked in
365 * at link time.
366 */
367 return 1;
368 default:
369 assert(0);
370 return 0;
371 }
372 }
373
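/**
 * Allocates a new virtual GRF of the given size in vec4 registers, growing
 * the size-tracking array as needed, and returns its index.
 */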
374 int
375 vec4_visitor::virtual_grf_alloc(int size)
376 {
377 if (virtual_grf_array_size <= virtual_grf_count) {
378 if (virtual_grf_array_size == 0)
379 virtual_grf_array_size = 16;
380 else
381 virtual_grf_array_size *= 2;
382 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
383 virtual_grf_array_size);
384 }
385 virtual_grf_sizes[virtual_grf_count] = size;
386 return virtual_grf_count++;
387 }
388
389 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
390 {
391 init();
392
393 this->file = GRF;
394 this->reg = v->virtual_grf_alloc(type_size(type));
395
396 if (type->is_array() || type->is_record()) {
397 this->swizzle = BRW_SWIZZLE_NOOP;
398 } else {
399 this->swizzle = swizzle_for_size(type->vector_elements);
400 }
401
402 this->type = brw_type_for_base_type(type);
403 }
404
405 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
406 {
407 init();
408
409 this->file = GRF;
410 this->reg = v->virtual_grf_alloc(type_size(type));
411
412 if (type->is_array() || type->is_record()) {
413 this->writemask = WRITEMASK_XYZW;
414 } else {
415 this->writemask = (1 << type->vector_elements) - 1;
416 }
417
418 this->type = brw_type_for_base_type(type);
419 }
420
421 /* Our support for uniforms is piggy-backed on the struct
422  * gl_vertex_program, because that's where the values actually
423 * get stored, rather than in some global gl_shader_program uniform
424 * store.
425 */
426 int
427 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
428 {
429 unsigned int offset = 0;
430 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
431
432 if (type->is_matrix()) {
433 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
434 type->vector_elements,
435 1);
436
437 for (unsigned int i = 0; i < type->matrix_columns; i++) {
438 offset += setup_uniform_values(loc + offset, column);
439 }
440
441 return offset;
442 }
443
444 switch (type->base_type) {
445 case GLSL_TYPE_FLOAT:
446 case GLSL_TYPE_UINT:
447 case GLSL_TYPE_INT:
448 case GLSL_TYPE_BOOL:
449 for (unsigned int i = 0; i < type->vector_elements; i++) {
450 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
451 }
452
453 /* Set up pad elements to get things aligned to a vec4 boundary. */
454 for (unsigned int i = type->vector_elements; i < 4; i++) {
455 static float zero = 0;
456
457 c->prog_data.param[this->uniforms * 4 + i] = &zero;
458 }
459
460 /* Track the size of this uniform vector, for future packing of
461 * uniforms.
462 */
463 this->uniform_vector_size[this->uniforms] = type->vector_elements;
464 this->uniforms++;
465
466 return 1;
467
468 case GLSL_TYPE_STRUCT:
469 for (unsigned int i = 0; i < type->length; i++) {
470 offset += setup_uniform_values(loc + offset,
471 type->fields.structure[i].type);
472 }
473 return offset;
474
475 case GLSL_TYPE_ARRAY:
476 for (unsigned int i = 0; i < type->length; i++) {
477 offset += setup_uniform_values(loc + offset, type->fields.array);
478 }
479 return offset;
480
481 case GLSL_TYPE_SAMPLER:
482 /* The sampler takes up a slot, but we don't use any values from it. */
483 return 1;
484
485 default:
486 assert(!"not reached");
487 return 0;
488 }
489 }
490
491 /* Our support for builtin uniforms is even scarier than non-builtin.
492 * It sits on top of the PROG_STATE_VAR parameters that are
493 * automatically updated from GL context state.
494 */
495 void
496 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
497 {
498 const ir_state_slot *const slots = ir->state_slots;
499 assert(ir->state_slots != NULL);
500
501 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
502 /* This state reference has already been setup by ir_to_mesa,
503 * but we'll get the same index back here. We can reference
504 * ParameterValues directly, since unlike brw_fs.cpp, we never
505 * add new state references during compile.
506 */
507 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
508 (gl_state_index *)slots[i].tokens);
509 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
510
511 this->uniform_vector_size[this->uniforms] = 0;
512 /* Add each of the unique swizzled channels of the element.
513 * This will end up matching the size of the glsl_type of this field.
514 */
515 int last_swiz = -1;
516 for (unsigned int j = 0; j < 4; j++) {
517 int swiz = GET_SWZ(slots[i].swizzle, j);
518 last_swiz = swiz;
519
520 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
521 if (swiz <= last_swiz)
522 this->uniform_vector_size[this->uniforms]++;
523 }
524 this->uniforms++;
525 }
526 }
527
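/**
 * Looks up the register storage previously assigned to an ir_variable, or
 * returns NULL if the variable has not been visited yet.
 */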
528 dst_reg *
529 vec4_visitor::variable_storage(ir_variable *var)
530 {
531 return (dst_reg *)hash_table_find(this->variable_ht, var);
532 }
533
534 void
535 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
536 {
537 ir_expression *expr = ir->as_expression();
538
539 if (expr) {
540 src_reg op[2];
541 vec4_instruction *inst;
542
543 assert(expr->get_num_operands() <= 2);
544 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
545 assert(expr->operands[i]->type->is_scalar());
546
547 expr->operands[i]->accept(this);
548 op[i] = this->result;
549 }
550
551 switch (expr->operation) {
552 case ir_unop_logic_not:
553 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
554 inst->conditional_mod = BRW_CONDITIONAL_Z;
555 break;
556
557 case ir_binop_logic_xor:
558 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
559 inst->conditional_mod = BRW_CONDITIONAL_NZ;
560 break;
561
562 case ir_binop_logic_or:
563 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
564 inst->conditional_mod = BRW_CONDITIONAL_NZ;
565 break;
566
567 case ir_binop_logic_and:
568 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
569 inst->conditional_mod = BRW_CONDITIONAL_NZ;
570 break;
571
572 case ir_unop_f2b:
573 if (intel->gen >= 6) {
574 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
575 } else {
576 inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
577 }
578 inst->conditional_mod = BRW_CONDITIONAL_NZ;
579 break;
580
581 case ir_unop_i2b:
582 if (intel->gen >= 6) {
583 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
584 } else {
585 inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
586 }
587 inst->conditional_mod = BRW_CONDITIONAL_NZ;
588 break;
589
590 case ir_binop_greater:
591 case ir_binop_gequal:
592 case ir_binop_less:
593 case ir_binop_lequal:
594 case ir_binop_equal:
595 case ir_binop_all_equal:
596 case ir_binop_nequal:
597 case ir_binop_any_nequal:
598 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
599 inst->conditional_mod =
600 brw_conditional_for_comparison(expr->operation);
601 break;
602
603 default:
604 assert(!"not reached");
605 break;
606 }
607 return;
608 }
609
610 ir->accept(this);
611
612 if (intel->gen >= 6) {
613 vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
614 this->result, src_reg(1));
615 inst->conditional_mod = BRW_CONDITIONAL_NZ;
616 } else {
617 vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
618 inst->conditional_mod = BRW_CONDITIONAL_NZ;
619 }
620 }
621
622 /**
623 * Emit a gen6 IF statement with the comparison folded into the IF
624 * instruction.
625 */
626 void
627 vec4_visitor::emit_if_gen6(ir_if *ir)
628 {
629 ir_expression *expr = ir->condition->as_expression();
630
631 if (expr) {
632 src_reg op[2];
633 vec4_instruction *inst;
634 dst_reg temp;
635
636 assert(expr->get_num_operands() <= 2);
637 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
638 expr->operands[i]->accept(this);
639 op[i] = this->result;
640 }
641
642 switch (expr->operation) {
643 case ir_unop_logic_not:
644 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
645 inst->conditional_mod = BRW_CONDITIONAL_Z;
646 return;
647
648 case ir_binop_logic_xor:
649 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
650 inst->conditional_mod = BRW_CONDITIONAL_NZ;
651 return;
652
653 case ir_binop_logic_or:
654 temp = dst_reg(this, glsl_type::bool_type);
655 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
656 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
657 inst->conditional_mod = BRW_CONDITIONAL_NZ;
658 return;
659
660 case ir_binop_logic_and:
661 temp = dst_reg(this, glsl_type::bool_type);
662 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
663 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
664 inst->conditional_mod = BRW_CONDITIONAL_NZ;
665 return;
666
667 case ir_unop_f2b:
668 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
669 inst->conditional_mod = BRW_CONDITIONAL_NZ;
670 return;
671
672 case ir_unop_i2b:
673 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
674 inst->conditional_mod = BRW_CONDITIONAL_NZ;
675 return;
676
677 case ir_binop_greater:
678 case ir_binop_gequal:
679 case ir_binop_less:
680 case ir_binop_lequal:
681 case ir_binop_equal:
682 case ir_binop_nequal:
683 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
684 inst->conditional_mod =
685 brw_conditional_for_comparison(expr->operation);
686 return;
687
688 case ir_binop_all_equal:
689 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
690 inst->conditional_mod = BRW_CONDITIONAL_Z;
691
692 inst = emit(BRW_OPCODE_IF);
693 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
694 return;
695
696 case ir_binop_any_nequal:
697 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
698 inst->conditional_mod = BRW_CONDITIONAL_NZ;
699
700 inst = emit(BRW_OPCODE_IF);
701 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702 return;
703
704 case ir_unop_any:
705 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
706 inst->conditional_mod = BRW_CONDITIONAL_NZ;
707
708 inst = emit(BRW_OPCODE_IF);
709 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
710 return;
711
712 default:
713 assert(!"not reached");
714 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
715 inst->conditional_mod = BRW_CONDITIONAL_NZ;
716 return;
717 }
718 return;
719 }
720
721 ir->condition->accept(this);
722
723 vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
724 this->result, src_reg(0));
725 inst->conditional_mod = BRW_CONDITIONAL_NZ;
726 }
727
728 void
729 vec4_visitor::visit(ir_variable *ir)
730 {
731 dst_reg *reg = NULL;
732
733 if (variable_storage(ir))
734 return;
735
736 switch (ir->mode) {
737 case ir_var_in:
738 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
739
740 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
741 * come in as floating point conversions of the integer values.
742 */
743 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
744 if (!c->key.gl_fixed_input_size[i])
745 continue;
746
747 dst_reg dst = *reg;
748 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
749 emit(BRW_OPCODE_MUL, dst, src_reg(dst), src_reg(1.0f / 65536.0f));
750 }
751 break;
752
753 case ir_var_out:
754 reg = new(mem_ctx) dst_reg(this, ir->type);
755
756 for (int i = 0; i < type_size(ir->type); i++) {
757 output_reg[ir->location + i] = *reg;
758 output_reg[ir->location + i].reg_offset = i;
759 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
760 }
761 break;
762
763 case ir_var_auto:
764 case ir_var_temporary:
765 reg = new(mem_ctx) dst_reg(this, ir->type);
766 break;
767
768 case ir_var_uniform:
769 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
770
771 /* Track how big the whole uniform variable is, in case we need to put a
772 * copy of its data into pull constants for array access.
773 */
774 this->uniform_size[this->uniforms] = type_size(ir->type);
775
776 if (!strncmp(ir->name, "gl_", 3)) {
777 setup_builtin_uniform_values(ir);
778 } else {
779 setup_uniform_values(ir->location, ir->type);
780 }
781 break;
782
783 default:
784 assert(!"not reached");
785 }
786
787 reg->type = brw_type_for_base_type(ir->type);
788 hash_table_insert(this->variable_ht, reg, ir);
789 }
790
791 void
792 vec4_visitor::visit(ir_loop *ir)
793 {
794 dst_reg counter;
795
796 /* We don't want debugging output to print the whole body of the
797 * loop as the annotation.
798 */
799 this->base_ir = NULL;
800
801 if (ir->counter != NULL) {
802 this->base_ir = ir->counter;
803 ir->counter->accept(this);
804 counter = *(variable_storage(ir->counter));
805
806 if (ir->from != NULL) {
807 this->base_ir = ir->from;
808 ir->from->accept(this);
809
810 emit(BRW_OPCODE_MOV, counter, this->result);
811 }
812 }
813
814 emit(BRW_OPCODE_DO);
815
816 if (ir->to) {
817 this->base_ir = ir->to;
818 ir->to->accept(this);
819
820 vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst_null_d(),
821 src_reg(counter), this->result);
822 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
823
824 inst = emit(BRW_OPCODE_BREAK);
825 inst->predicate = BRW_PREDICATE_NORMAL;
826 }
827
828 visit_instructions(&ir->body_instructions);
829
830
831 if (ir->increment) {
832 this->base_ir = ir->increment;
833 ir->increment->accept(this);
834 emit(BRW_OPCODE_ADD, counter, src_reg(counter), this->result);
835 }
836
837 emit(BRW_OPCODE_WHILE);
838 }
839
840 void
841 vec4_visitor::visit(ir_loop_jump *ir)
842 {
843 switch (ir->mode) {
844 case ir_loop_jump::jump_break:
845 emit(BRW_OPCODE_BREAK);
846 break;
847 case ir_loop_jump::jump_continue:
848 emit(BRW_OPCODE_CONTINUE);
849 break;
850 }
851 }
852
853
854 void
855 vec4_visitor::visit(ir_function_signature *ir)
856 {
857 assert(0);
858 (void)ir;
859 }
860
861 void
862 vec4_visitor::visit(ir_function *ir)
863 {
864 /* Ignore function bodies other than main() -- we shouldn't see calls to
865 * them since they should all be inlined.
866 */
867 if (strcmp(ir->name, "main") == 0) {
868 const ir_function_signature *sig;
869 exec_list empty;
870
871 sig = ir->matching_signature(&empty);
872
873 assert(sig);
874
875 visit_instructions(&sig->body);
876 }
877 }
878
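/**
 * If this expression is just an rvalue to be saturated, emit a saturating
 * MOV of it into a fresh temporary as our result and return true so the
 * caller can skip the normal expression handling.
 */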
879 GLboolean
880 vec4_visitor::try_emit_sat(ir_expression *ir)
881 {
882 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
883 if (!sat_src)
884 return false;
885
886 sat_src->accept(this);
887 src_reg src = this->result;
888
889 this->result = src_reg(this, ir->type);
890 vec4_instruction *inst;
891 inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
892 inst->saturate = true;
893
894 return true;
895 }
896
897 void
898 vec4_visitor::emit_bool_comparison(unsigned int op,
899 dst_reg dst, src_reg src0, src_reg src1)
900 {
901 /* original gen4 does destination conversion before comparison. */
902 if (intel->gen < 5)
903 dst.type = src0.type;
904
905 vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
906 inst->conditional_mod = brw_conditional_for_comparison(op);
907
908 dst.type = BRW_REGISTER_TYPE_D;
909 emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
910 }
911
912 void
913 vec4_visitor::visit(ir_expression *ir)
914 {
915 unsigned int operand;
916 src_reg op[Elements(ir->operands)];
917 src_reg result_src;
918 dst_reg result_dst;
919 vec4_instruction *inst;
920
921 if (try_emit_sat(ir))
922 return;
923
924 for (operand = 0; operand < ir->get_num_operands(); operand++) {
925 this->result.file = BAD_FILE;
926 ir->operands[operand]->accept(this);
927 if (this->result.file == BAD_FILE) {
928 printf("Failed to get tree for expression operand:\n");
929 ir->operands[operand]->print();
930 exit(1);
931 }
932 op[operand] = this->result;
933
934 /* Matrix expression operands should have been broken down to vector
935 * operations already.
936 */
937 assert(!ir->operands[operand]->type->is_matrix());
938 }
939
940 int vector_elements = ir->operands[0]->type->vector_elements;
941 if (ir->operands[1]) {
942 vector_elements = MAX2(vector_elements,
943 ir->operands[1]->type->vector_elements);
944 }
945
946 this->result.file = BAD_FILE;
947
948 /* Storage for our result. Ideally for an assignment we'd be using
949 * the actual storage for the result here, instead.
950 */
951 result_src = src_reg(this, ir->type);
952 /* convenience for the emit functions below. */
953 result_dst = dst_reg(result_src);
954 /* If nothing special happens, this is the result. */
955 this->result = result_src;
956 /* Limit writes to the channels that will be used by result_src later.
957 * This does limit this temp's use as a temporary for multi-instruction
958 * sequences.
959 */
960 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
961
962 switch (ir->operation) {
963 case ir_unop_logic_not:
964    /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
965     * the ones' complement of the whole register, not just bit 0.
966 */
967 emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
968 break;
969 case ir_unop_neg:
970 op[0].negate = !op[0].negate;
971 this->result = op[0];
972 break;
973 case ir_unop_abs:
974 op[0].abs = true;
975 op[0].negate = false;
976 this->result = op[0];
977 break;
978
979 case ir_unop_sign:
980 emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
981
982 inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
983 inst->conditional_mod = BRW_CONDITIONAL_G;
984 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
985 inst->predicate = BRW_PREDICATE_NORMAL;
986
987 inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
988 inst->conditional_mod = BRW_CONDITIONAL_L;
989 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
990 inst->predicate = BRW_PREDICATE_NORMAL;
991
992 break;
993
994 case ir_unop_rcp:
995 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
996 break;
997
998 case ir_unop_exp2:
999 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1000 break;
1001 case ir_unop_log2:
1002 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1003 break;
1004 case ir_unop_exp:
1005 case ir_unop_log:
1006 assert(!"not reached: should be handled by ir_explog_to_explog2");
1007 break;
1008 case ir_unop_sin:
1009 case ir_unop_sin_reduced:
1010 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1011 break;
1012 case ir_unop_cos:
1013 case ir_unop_cos_reduced:
1014 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1015 break;
1016
1017 case ir_unop_dFdx:
1018 case ir_unop_dFdy:
1019 assert(!"derivatives not valid in vertex shader");
1020 break;
1021
1022 case ir_unop_noise:
1023 assert(!"not reached: should be handled by lower_noise");
1024 break;
1025
1026 case ir_binop_add:
1027 emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
1028 break;
1029 case ir_binop_sub:
1030 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1031 break;
1032
1033 case ir_binop_mul:
1034 if (ir->type->is_integer()) {
1035 /* For integer multiplication, the MUL uses the low 16 bits
1036 * of one of the operands (src0 on gen6, src1 on gen7). The
1037 * MACH accumulates in the contribution of the upper 16 bits
1038 * of that operand.
1039 *
1040 * FINISHME: Emit just the MUL if we know an operand is small
1041 * enough.
1042 */
1043 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1044
1045 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
1046 emit(BRW_OPCODE_MACH, dst_null_d(), op[0], op[1]);
1047 emit(BRW_OPCODE_MOV, result_dst, src_reg(acc));
1048 } else {
1049 emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
1050 }
1051 break;
1052 case ir_binop_div:
1053 assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1054 case ir_binop_mod:
1055 assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1056 break;
1057
1058 case ir_binop_less:
1059 case ir_binop_greater:
1060 case ir_binop_lequal:
1061 case ir_binop_gequal:
1062 case ir_binop_equal:
1063 case ir_binop_nequal: {
1064 dst_reg temp = result_dst;
1065 /* original gen4 does implicit conversion before comparison. */
1066 if (intel->gen < 5)
1067 temp.type = op[0].type;
1068
1069 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1070 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
1071 emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
1072 break;
1073 }
1074
1075 case ir_binop_all_equal:
1076 /* "==" operator producing a scalar boolean. */
1077 if (ir->operands[0]->type->is_vector() ||
1078 ir->operands[1]->type->is_vector()) {
1079 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1080 inst->conditional_mod = BRW_CONDITIONAL_Z;
1081
1082 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1083 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1084 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1085 } else {
1086 dst_reg temp = result_dst;
1087 /* original gen4 does implicit conversion before comparison. */
1088 if (intel->gen < 5)
1089 temp.type = op[0].type;
1090
1091 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1092 inst->conditional_mod = BRW_CONDITIONAL_Z;
1093 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1094 }
1095 break;
1096 case ir_binop_any_nequal:
1097 /* "!=" operator producing a scalar boolean. */
1098 if (ir->operands[0]->type->is_vector() ||
1099 ir->operands[1]->type->is_vector()) {
1100 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1101 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1102
1103 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1104 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1105 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1106 } else {
1107 dst_reg temp = result_dst;
1108 /* original gen4 does implicit conversion before comparison. */
1109 if (intel->gen < 5)
1110 temp.type = op[0].type;
1111
1112 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1113 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1114 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1115 }
1116 break;
1117
1118 case ir_unop_any:
1119 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
1120 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1121
1122 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1123
1124 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1125 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1126 break;
1127
1128 case ir_binop_logic_xor:
1129 emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1130 break;
1131
1132 case ir_binop_logic_or:
1133 emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1134 break;
1135
1136 case ir_binop_logic_and:
1137 emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1138 break;
1139
1140 case ir_binop_dot:
1141 assert(ir->operands[0]->type->is_vector());
1142 assert(ir->operands[0]->type == ir->operands[1]->type);
1143 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1144 break;
1145
1146 case ir_unop_sqrt:
1147 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1148 break;
1149 case ir_unop_rsq:
1150 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1151 break;
1152 case ir_unop_i2f:
1153 case ir_unop_i2u:
1154 case ir_unop_u2i:
1155 case ir_unop_u2f:
1156 case ir_unop_b2f:
1157 case ir_unop_b2i:
1158 case ir_unop_f2i:
1159 emit(BRW_OPCODE_MOV, result_dst, op[0]);
1160 break;
1161 case ir_unop_f2b:
1162 case ir_unop_i2b: {
1163 dst_reg temp = result_dst;
1164 /* original gen4 does implicit conversion before comparison. */
1165 if (intel->gen < 5)
1166 temp.type = op[0].type;
1167
1168 inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
1169 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1170 inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
1171 break;
1172 }
1173
1174 case ir_unop_trunc:
1175 emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
1176 break;
1177 case ir_unop_ceil:
1178 op[0].negate = !op[0].negate;
1179 inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1180 this->result.negate = true;
1181 break;
1182 case ir_unop_floor:
1183 inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1184 break;
1185 case ir_unop_fract:
1186 inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
1187 break;
1188 case ir_unop_round_even:
1189 emit(BRW_OPCODE_RNDE, result_dst, op[0]);
1190 break;
1191
1192 case ir_binop_min:
1193 inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1194 inst->conditional_mod = BRW_CONDITIONAL_L;
1195
1196 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1197 inst->predicate = BRW_PREDICATE_NORMAL;
1198 break;
1199 case ir_binop_max:
1200 inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1201 inst->conditional_mod = BRW_CONDITIONAL_G;
1202
1203 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1204 inst->predicate = BRW_PREDICATE_NORMAL;
1205 break;
1206
1207 case ir_binop_pow:
1208 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1209 break;
1210
1211 case ir_unop_bit_not:
1212 inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
1213 break;
1214 case ir_binop_bit_and:
1215 inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1216 break;
1217 case ir_binop_bit_xor:
1218 inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1219 break;
1220 case ir_binop_bit_or:
1221 inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1222 break;
1223
1224 case ir_binop_lshift:
1225 case ir_binop_rshift:
1226 assert(!"GLSL 1.30 features unsupported");
1227 break;
1228
1229 case ir_quadop_vector:
1230 assert(!"not reached: should be handled by lower_quadop_vector");
1231 break;
1232 }
1233 }
1234
1235
1236 void
1237 vec4_visitor::visit(ir_swizzle *ir)
1238 {
1239 src_reg src;
1240 int i = 0;
1241 int swizzle[4];
1242
1243 /* Note that this is only swizzles in expressions, not those on the left
1244 * hand side of an assignment, which do write masking. See ir_assignment
1245 * for that.
1246 */
1247
1248 ir->val->accept(this);
1249 src = this->result;
1250 assert(src.file != BAD_FILE);
1251
1252 for (i = 0; i < ir->type->vector_elements; i++) {
1253 switch (i) {
1254 case 0:
1255 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1256 break;
1257 case 1:
1258 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1259 break;
1260 case 2:
1261 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1262 break;
1263 case 3:
1264 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1265 break;
1266 }
1267 }
1268 for (; i < 4; i++) {
1269 /* Replicate the last channel out. */
1270 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1271 }
1272
1273 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1274
1275 this->result = src;
1276 }
1277
1278 void
1279 vec4_visitor::visit(ir_dereference_variable *ir)
1280 {
1281 const struct glsl_type *type = ir->type;
1282 dst_reg *reg = variable_storage(ir->var);
1283
1284 if (!reg) {
1285 fail("Failed to find variable storage for %s\n", ir->var->name);
1286 this->result = src_reg(brw_null_reg());
1287 return;
1288 }
1289
1290 this->result = src_reg(*reg);
1291
1292 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1293 this->result.swizzle = swizzle_for_size(type->vector_elements);
1294 }
1295
1296 void
1297 vec4_visitor::visit(ir_dereference_array *ir)
1298 {
1299 ir_constant *constant_index;
1300 src_reg src;
1301 int element_size = type_size(ir->type);
1302
1303 constant_index = ir->array_index->constant_expression_value();
1304
1305 ir->array->accept(this);
1306 src = this->result;
1307
1308 if (constant_index) {
1309 src.reg_offset += constant_index->value.i[0] * element_size;
1310 } else {
1311 /* Variable index array dereference. It eats the "vec4" of the
1312 * base of the array and an index that offsets the Mesa register
1313 * index.
1314 */
1315 ir->array_index->accept(this);
1316
1317 src_reg index_reg;
1318
1319 if (element_size == 1) {
1320 index_reg = this->result;
1321 } else {
1322 index_reg = src_reg(this, glsl_type::int_type);
1323
1324 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
1325 this->result, src_reg(element_size));
1326 }
1327
1328 if (src.reladdr) {
1329 src_reg temp = src_reg(this, glsl_type::int_type);
1330
1331 emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg);
1332
1333 index_reg = temp;
1334 }
1335
1336 src.reladdr = ralloc(mem_ctx, src_reg);
1337 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1338 }
1339
1340 /* If the type is smaller than a vec4, replicate the last channel out. */
1341 if (ir->type->is_scalar() || ir->type->is_vector())
1342 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1343 else
1344 src.swizzle = BRW_SWIZZLE_NOOP;
1345 src.type = brw_type_for_base_type(ir->type);
1346
1347 this->result = src;
1348 }
1349
1350 void
1351 vec4_visitor::visit(ir_dereference_record *ir)
1352 {
1353 unsigned int i;
1354 const glsl_type *struct_type = ir->record->type;
1355 int offset = 0;
1356
1357 ir->record->accept(this);
1358
1359 for (i = 0; i < struct_type->length; i++) {
1360 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1361 break;
1362 offset += type_size(struct_type->fields.structure[i].type);
1363 }
1364
1365 /* If the type is smaller than a vec4, replicate the last channel out. */
1366 if (ir->type->is_scalar() || ir->type->is_vector())
1367 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1368 else
1369 this->result.swizzle = BRW_SWIZZLE_NOOP;
1370 this->result.type = brw_type_for_base_type(ir->type);
1371
1372 this->result.reg_offset += offset;
1373 }
1374
1375 /**
1376 * We want to be careful in assignment setup to hit the actual storage
1377 * instead of potentially using a temporary like we might with the
1378 * ir_dereference handler.
1379 */
1380 static dst_reg
1381 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1382 {
1383 /* The LHS must be a dereference. If the LHS is a variable indexed array
1384  * access of a vector, it must be separated into a series of conditional moves
1385 * before reaching this point (see ir_vec_index_to_cond_assign).
1386 */
1387 assert(ir->as_dereference());
1388 ir_dereference_array *deref_array = ir->as_dereference_array();
1389 if (deref_array) {
1390 assert(!deref_array->array->type->is_vector());
1391 }
1392
1393 /* Use the rvalue deref handler for the most part. We'll ignore
1394 * swizzles in it and write swizzles using writemask, though.
1395 */
1396 ir->accept(v);
1397 return dst_reg(v->result);
1398 }
1399
1400 void
1401 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1402 const struct glsl_type *type, bool predicated)
1403 {
1404 if (type->base_type == GLSL_TYPE_STRUCT) {
1405 for (unsigned int i = 0; i < type->length; i++) {
1406 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1407 }
1408 return;
1409 }
1410
1411 if (type->is_array()) {
1412 for (unsigned int i = 0; i < type->length; i++) {
1413 emit_block_move(dst, src, type->fields.array, predicated);
1414 }
1415 return;
1416 }
1417
1418 if (type->is_matrix()) {
1419 const struct glsl_type *vec_type;
1420
1421 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1422 type->vector_elements, 1);
1423
1424 for (int i = 0; i < type->matrix_columns; i++) {
1425 emit_block_move(dst, src, vec_type, predicated);
1426 }
1427 return;
1428 }
1429
1430 assert(type->is_scalar() || type->is_vector());
1431
1432 dst->type = brw_type_for_base_type(type);
1433 src->type = dst->type;
1434
1435 dst->writemask = (1 << type->vector_elements) - 1;
1436
1437 /* Do we need to worry about swizzling a swizzle? */
1438    assert(src->swizzle == BRW_SWIZZLE_NOOP);
1439 src->swizzle = swizzle_for_size(type->vector_elements);
1440
1441 vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src);
1442 if (predicated)
1443 inst->predicate = BRW_PREDICATE_NORMAL;
1444
1445 dst->reg_offset++;
1446 src->reg_offset++;
1447 }
1448
1449
1450 /* If the RHS processing resulted in an instruction generating a
1451 * temporary value, and it would be easy to rewrite the instruction to
1452 * generate its result right into the LHS instead, do so. This ends
1453 * up reliably removing instructions where it can be tricky to do so
1454 * later without real UD chain information.
1455 */
1456 bool
1457 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1458 dst_reg dst,
1459 src_reg src,
1460 vec4_instruction *pre_rhs_inst,
1461 vec4_instruction *last_rhs_inst)
1462 {
1463 /* This could be supported, but it would take more smarts. */
1464 if (ir->condition)
1465 return false;
1466
1467 if (pre_rhs_inst == last_rhs_inst)
1468 return false; /* No instructions generated to work with. */
1469
1470 /* Make sure the last instruction generated our source reg. */
1471 if (src.file != GRF ||
1472 src.file != last_rhs_inst->dst.file ||
1473 src.reg != last_rhs_inst->dst.reg ||
1474 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1475 src.reladdr ||
1476 src.abs ||
1477 src.negate ||
1478 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1479 return false;
1480
1481    /* Check that the last instruction fully initialized the channels
1482 * we want to use, in the order we want to use them. We could
1483 * potentially reswizzle the operands of many instructions so that
1484 * we could handle out of order channels, but don't yet.
1485 */
1486 for (int i = 0; i < 4; i++) {
1487 if (dst.writemask & (1 << i)) {
1488 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1489 return false;
1490
1491 if (BRW_GET_SWZ(src.swizzle, i) != i)
1492 return false;
1493 }
1494 }
1495
1496 /* Success! Rewrite the instruction. */
1497 last_rhs_inst->dst.file = dst.file;
1498 last_rhs_inst->dst.reg = dst.reg;
1499 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1500 last_rhs_inst->dst.reladdr = dst.reladdr;
1501 last_rhs_inst->dst.writemask &= dst.writemask;
1502
1503 return true;
1504 }
1505
1506 void
1507 vec4_visitor::visit(ir_assignment *ir)
1508 {
1509 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1510
1511 if (!ir->lhs->type->is_scalar() &&
1512 !ir->lhs->type->is_vector()) {
1513 ir->rhs->accept(this);
1514 src_reg src = this->result;
1515
1516 if (ir->condition) {
1517 emit_bool_to_cond_code(ir->condition);
1518 }
1519
1520 emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1521 return;
1522 }
1523
1524 /* Now we're down to just a scalar/vector with writemasks. */
1525 int i;
1526
1527 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1528 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1529
1530 ir->rhs->accept(this);
1531
1532 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1533
1534 src_reg src = this->result;
1535
1536 int swizzles[4];
1537 int first_enabled_chan = 0;
1538 int src_chan = 0;
1539
1540 assert(ir->lhs->type->is_vector() ||
1541 ir->lhs->type->is_scalar());
1542 dst.writemask = ir->write_mask;
1543
1544 for (int i = 0; i < 4; i++) {
1545 if (dst.writemask & (1 << i)) {
1546 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1547 break;
1548 }
1549 }
1550
1551 /* Swizzle a small RHS vector into the channels being written.
1552 *
1553 * glsl ir treats write_mask as dictating how many channels are
1554 * present on the RHS while in our instructions we need to make
1555 * those channels appear in the slots of the vec4 they're written to.
1556 */
1557 for (int i = 0; i < 4; i++) {
1558 if (dst.writemask & (1 << i))
1559 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1560 else
1561 swizzles[i] = first_enabled_chan;
1562 }
1563 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1564 swizzles[2], swizzles[3]);
1565
1566 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1567 return;
1568 }
1569
1570 if (ir->condition) {
1571 emit_bool_to_cond_code(ir->condition);
1572 }
1573
1574 for (i = 0; i < type_size(ir->lhs->type); i++) {
1575 vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
1576
1577 if (ir->condition)
1578 inst->predicate = BRW_PREDICATE_NORMAL;
1579
1580 dst.reg_offset++;
1581 src.reg_offset++;
1582 }
1583 }
1584
1585 void
1586 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1587 {
1588 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1589 foreach_list(node, &ir->components) {
1590 ir_constant *field_value = (ir_constant *)node;
1591
1592 emit_constant_values(dst, field_value);
1593 }
1594 return;
1595 }
1596
1597 if (ir->type->is_array()) {
1598 for (unsigned int i = 0; i < ir->type->length; i++) {
1599 emit_constant_values(dst, ir->array_elements[i]);
1600 }
1601 return;
1602 }
1603
1604 if (ir->type->is_matrix()) {
1605 for (int i = 0; i < ir->type->matrix_columns; i++) {
1606 for (int j = 0; j < ir->type->vector_elements; j++) {
1607 dst->writemask = 1 << j;
1608 dst->type = BRW_REGISTER_TYPE_F;
1609
1610 emit(BRW_OPCODE_MOV, *dst,
1611 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
1612 }
1613 dst->reg_offset++;
1614 }
1615 return;
1616 }
1617
1618 for (int i = 0; i < ir->type->vector_elements; i++) {
1619 dst->writemask = 1 << i;
1620 dst->type = brw_type_for_base_type(ir->type);
1621
1622 switch (ir->type->base_type) {
1623 case GLSL_TYPE_FLOAT:
1624 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i]));
1625 break;
1626 case GLSL_TYPE_INT:
1627 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i]));
1628 break;
1629 case GLSL_TYPE_UINT:
1630 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i]));
1631 break;
1632 case GLSL_TYPE_BOOL:
1633 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i]));
1634 break;
1635 default:
1636 assert(!"Non-float/uint/int/bool constant");
1637 break;
1638 }
1639 }
1640 dst->reg_offset++;
1641 }
1642
1643 void
1644 vec4_visitor::visit(ir_constant *ir)
1645 {
1646 dst_reg dst = dst_reg(this, ir->type);
1647 this->result = src_reg(dst);
1648
1649 emit_constant_values(&dst, ir);
1650 }
1651
1652 void
1653 vec4_visitor::visit(ir_call *ir)
1654 {
1655 assert(!"not reached");
1656 }
1657
1658 void
1659 vec4_visitor::visit(ir_texture *ir)
1660 {
1661 /* FINISHME: Implement vertex texturing.
1662 *
1663 * With 0 vertex samplers available, the linker will reject
1664 * programs that do vertex texturing, but after our visitor has
1665 * run.
1666 */
1667 }
1668
1669 void
1670 vec4_visitor::visit(ir_return *ir)
1671 {
1672 assert(!"not reached");
1673 }
1674
1675 void
1676 vec4_visitor::visit(ir_discard *ir)
1677 {
1678 assert(!"not reached");
1679 }
1680
1681 void
1682 vec4_visitor::visit(ir_if *ir)
1683 {
1684 /* Don't point the annotation at the if statement, because then it plus
1685 * the then and else blocks get printed.
1686 */
1687 this->base_ir = ir->condition;
1688
1689 if (intel->gen == 6) {
1690 emit_if_gen6(ir);
1691 } else {
1692 emit_bool_to_cond_code(ir->condition);
1693 vec4_instruction *inst = emit(BRW_OPCODE_IF);
1694 inst->predicate = BRW_PREDICATE_NORMAL;
1695 }
1696
1697 visit_instructions(&ir->then_instructions);
1698
1699 if (!ir->else_instructions.is_empty()) {
1700 this->base_ir = ir->condition;
1701 emit(BRW_OPCODE_ELSE);
1702
1703 visit_instructions(&ir->else_instructions);
1704 }
1705
1706 this->base_ir = ir->condition;
1707 emit(BRW_OPCODE_ENDIF);
1708 }
1709
1710 int
1711 vec4_visitor::emit_vue_header_gen4(int header_mrf)
1712 {
1713 /* Get the position */
1714 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1715
1716 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1717 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1718
1719 current_annotation = "NDC";
1720 dst_reg ndc_w = ndc;
1721 ndc_w.writemask = WRITEMASK_W;
1722 src_reg pos_w = pos;
1723 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1724 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1725
1726 dst_reg ndc_xyz = ndc;
1727 ndc_xyz.writemask = WRITEMASK_XYZ;
1728
1729 emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
1730
1731 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1732 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1733 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1734 GLuint i;
1735
1736 emit(BRW_OPCODE_MOV, header1, 0u);
1737
1738 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1739 assert(!"finishme: psiz");
1740 src_reg psiz;
1741
1742 header1.writemask = WRITEMASK_W;
1743 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
1744 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
1745 }
1746
1747 for (i = 0; i < c->key.nr_userclip; i++) {
1748 vec4_instruction *inst;
1749
1750 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
1751 pos, src_reg(c->userplane[i]));
1752 inst->conditional_mod = BRW_CONDITIONAL_L;
1753
1754     inst = emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
1755     inst->predicate = BRW_PREDICATE_NORMAL;
1756 }
1757
1758 /* i965 clipping workaround:
1759 * 1) Test for -ve rhw
1760 * 2) If set,
1761 * set ndc = (0,0,0,0)
1762 * set ucp[6] = 1
1763 *
1764 * Later, clipping will detect ucp[6] and ensure the primitive is
1765 * clipped against all fixed planes.
1766 */
1767 if (brw->has_negative_rhw_bug) {
1768 #if 0
1769 /* FINISHME */
1770 brw_CMP(p,
1771 vec8(brw_null_reg()),
1772 BRW_CONDITIONAL_L,
1773 brw_swizzle1(ndc, 3),
1774 brw_imm_f(0));
1775
1776 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1777 brw_MOV(p, ndc, brw_imm_f(0));
1778 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1779 #endif
1780 }
1781
1782 header1.writemask = WRITEMASK_XYZW;
1783 emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
1784 } else {
1785 emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
1786 BRW_REGISTER_TYPE_UD), 0u);
1787 }
1788
1789 if (intel->gen == 5) {
1790 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1791 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1792 * dword 4-7 (m2) is the ndc position (set above)
1793 * dword 8-11 (m3) of the vertex header is the 4D space position
1794 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1795 * m6 is a pad so that the vertex element data is aligned
1796 * m7 is the first vertex data we fill.
1797 */
1798 current_annotation = "NDC";
1799 emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1800
1801 current_annotation = "gl_Position";
1802 emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1803
1804 /* user clip distance. */
1805 header_mrf += 2;
1806
1807 /* Pad so that vertex element data is aligned. */
1808 header_mrf++;
1809 } else {
1810 /* There are 8 dwords in VUE header pre-Ironlake:
1811 * dword 0-3 (m1) is indices, point width, clip flags.
1812 * dword 4-7 (m2) is ndc position (set above)
1813 *
1814 * dword 8-11 (m3) is the first vertex data.
1815 */
1816 current_annotation = "NDC";
1817 emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1818
1819 current_annotation = "gl_Position";
1820 emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1821 }
1822
1823 return header_mrf;
1824 }
1825
1826 int
1827 vec4_visitor::emit_vue_header_gen6(int header_mrf)
1828 {
1829 struct brw_reg reg;
1830
1831 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1832 * dword 0-3 (m2) of the header is indices, point width, clip flags.
1833 * dword 4-7 (m3) is the 4D space position
1834 * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
1835 * enabled.
1836 *
1837 * m4 or 6 is the first vertex element data we fill.
1838 */
1839
1840 current_annotation = "indices, point width, clip flags";
1841 reg = brw_message_reg(header_mrf++);
1842 emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
1843 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1844 emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
1845 src_reg(output_reg[VERT_RESULT_PSIZ]));
1846 }
1847
1848 current_annotation = "gl_Position";
1849 emit(BRW_OPCODE_MOV,
1850 brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
1851
1852 current_annotation = "user clip distances";
1853 if (c->key.nr_userclip) {
1854 for (int i = 0; i < c->key.nr_userclip; i++) {
1855 struct brw_reg m;
1856 if (i < 4)
1857 m = brw_message_reg(header_mrf);
1858 else
1859 m = brw_message_reg(header_mrf + 1);
1860
1861 emit(DP4(dst_reg(brw_writemask(m, 1 << (i & 3))),
1862 src_reg(output_reg[VERT_RESULT_HPOS]),
1863 src_reg(c->userplane[i])));
1864 }
1865 header_mrf += 2;
1866 }
1867
1868 current_annotation = NULL;
1869
1870 return header_mrf;
1871 }
1872
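/**
 * Rounds an URB write message length so that the payload following the
 * single message header register is a multiple of two registers (256 bits),
 * as required for interleaved URB writes on gen6+.
 */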
1873 static int
1874 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1875 {
1876 struct intel_context *intel = &brw->intel;
1877
1878 if (intel->gen >= 6) {
1879 /* URB data written (does not include the message header reg) must
1880 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1881 * section 5.4.3.2.2: URB_INTERLEAVED.
1882 *
1883 * URB entries are allocated on a multiple of 1024 bits, so an
1884 * extra 128 bits written here to make the end align to 256 is
1885 * no problem.
1886 */
1887 if ((mlen % 2) != 1)
1888 mlen++;
1889 }
1890
1891 return mlen;
1892 }
1893
1894 /**
1895 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1896 * complete the VS thread.
1897 *
1898 * The VUE layout is documented in Volume 2a.
1899 */
1900 void
1901 vec4_visitor::emit_urb_writes()
1902 {
1903 /* MRF 0 is reserved for the debugger, so start with message header
1904 * in MRF 1.
1905 */
1906 int base_mrf = 1;
1907 int mrf = base_mrf;
1908 int urb_entry_size;
1909 uint64_t outputs_remaining = c->prog_data.outputs_written;
1910 /* In the process of generating our URB write message contents, we
1911 * may need to unspill a register or load from an array. Those
1912 * reads would use MRFs 14-15.
1913 */
1914 int max_usable_mrf = 13;
1915
1916 /* FINISHME: edgeflag */
1917
1918 /* First mrf is the g0-based message header containing URB handles and such,
1919 * which is implied in VS_OPCODE_URB_WRITE.
1920 */
1921 mrf++;
1922
1923 if (intel->gen >= 6) {
1924 mrf = emit_vue_header_gen6(mrf);
1925 } else {
1926 mrf = emit_vue_header_gen4(mrf);
1927 }
1928
1929 /* Set up the VUE data for the first URB write */
1930 int attr;
1931 for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
1932 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1933 continue;
1934
1935 outputs_remaining &= ~BITFIELD64_BIT(attr);
1936
1937 /* This is set up in the VUE header. */
1938 if (attr == VERT_RESULT_HPOS)
1939 continue;
1940
1941 /* This is loaded into the VUE header, and thus doesn't occupy
1942 * an attribute slot.
1943 */
1944 if (attr == VERT_RESULT_PSIZ)
1945 continue;
1946
1947 vec4_instruction *inst = emit(BRW_OPCODE_MOV, brw_message_reg(mrf++),
1948 src_reg(output_reg[attr]));
1949
1950 if ((attr == VERT_RESULT_COL0 ||
1951 attr == VERT_RESULT_COL1 ||
1952 attr == VERT_RESULT_BFC0 ||
1953 attr == VERT_RESULT_BFC1) &&
1954 c->key.clamp_vertex_color) {
1955 inst->saturate = true;
1956 }
1957
1958 /* If this was MRF 15, we can't fit anything more into this URB
1959 * WRITE. Note that base_mrf of 1 means that MRF 15 is an
1960 * even-numbered amount of URB write data, which will meet
1961 * gen6's requirements for length alignment.
1962 */
1963 if (mrf > max_usable_mrf) {
1964 attr++;
1965 break;
1966 }
1967 }
1968
1969 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1970 inst->base_mrf = base_mrf;
1971 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1972 inst->eot = !outputs_remaining;
1973
1974 urb_entry_size = mrf - base_mrf;
1975
1976 /* Optional second URB write */
1977 if (outputs_remaining) {
1978 mrf = base_mrf + 1;
1979
1980 for (; attr < VERT_RESULT_MAX; attr++) {
1981 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1982 continue;
1983
1984 assert(mrf < max_usable_mrf);
1985
1986 emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
1987 }
1988
1989 inst = emit(VS_OPCODE_URB_WRITE);
1990 inst->base_mrf = base_mrf;
1991 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1992 inst->eot = true;
1993 /* URB destination offset. In the previous write, we got MRFs
1994 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
1995 * URB row increments, and each of our MRFs is half of one of
1996 * those, since we're doing interleaved writes.
1997 */
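/* With base_mrf 1 and max_usable_mrf 13, that works out to (13 - 1) / 2 = 6. */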
1998 inst->offset = (max_usable_mrf - base_mrf) / 2;
1999
2000 urb_entry_size += mrf - base_mrf;
2001 }
2002
2003 if (intel->gen == 6)
2004 c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
2005 else
2006 c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
2007 }
2008
2009 src_reg
2010 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2011 src_reg *reladdr, int reg_offset)
2012 {
2013 /* Because we store the values to scratch interleaved like our
2014 * vertex data, we need to scale the vec4 index by 2.
2015 */
2016 int message_header_scale = 2;
2017
2018 /* Pre-gen6, the message header uses byte offsets instead of vec4
2019 * (16-byte) offset units.
2020 */
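/* e.g. vec4 slot 3 ends up at offset 3 * 2 = 6 on gen6+, or
 * 3 * 2 * 16 = 96 bytes before gen6.
 */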
2021 if (intel->gen < 6)
2022 message_header_scale *= 16;
2023
2024 if (reladdr) {
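/* index = (*reladdr + reg_offset) * message_header_scale; the ADD and MUL
 * below are moved so they land right before the instruction that uses the
 * offset.
 */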
2025 src_reg index = src_reg(this, glsl_type::int_type);
2026
2027 vec4_instruction *add = emit(BRW_OPCODE_ADD,
2028 dst_reg(index),
2029 *reladdr,
2030 src_reg(reg_offset));
2031 /* Move our new instruction from the tail to its correct place. */
2032 add->remove();
2033 inst->insert_before(add);
2034
2035 vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
2036 index, src_reg(message_header_scale));
2037 mul->remove();
2038 inst->insert_before(mul);
2039
2040 return index;
2041 } else {
2042 return src_reg(reg_offset * message_header_scale);
2043 }
2044 }
2045
2046 src_reg
2047 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2048 src_reg *reladdr, int reg_offset)
2049 {
2050 if (reladdr) {
2051 src_reg index = src_reg(this, glsl_type::int_type);
2052
2053 vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD,
2054 dst_reg(index),
2055 *reladdr,
2056 src_reg(reg_offset));
2057 add->ir = inst->ir;
2058 add->annotation = inst->annotation;
2059 inst->insert_before(add);
2060
2061 /* Pre-gen6, the message header uses byte offsets instead of vec4
2062 * (16-byte) offset units.
2063 */
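/* index = (*reladdr + reg_offset), scaled by 16 to a byte offset pre-gen6;
 * gen6+ uses the vec4 index directly.
 */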
2064 if (intel->gen < 6) {
2065 vec4_instruction *mul = new(mem_ctx) vec4_instruction(this,
2066 BRW_OPCODE_MUL,
2067 dst_reg(index),
2068 index,
2069 src_reg(16));
2070 mul->ir = inst->ir;
2071 mul->annotation = inst->annotation;
2072 inst->insert_before(mul);
2073 }
2074
2075 return index;
2076 } else {
2077 int message_header_scale = intel->gen < 6 ? 16 : 1;
2078 return src_reg(reg_offset * message_header_scale);
2079 }
2080 }
2081
2082 /**
2083 * Emits an instruction before @inst to load the value named by @orig_src
2084 * from scratch space at @base_offset to @temp.
2085 */
2086 void
2087 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2088 dst_reg temp, src_reg orig_src,
2089 int base_offset)
2090 {
2091 int reg_offset = base_offset + orig_src.reg_offset;
2092 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2093
2094 vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
2095 temp, index);
2096
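/* MRF 14 is one of the registers emit_urb_writes() keeps free for these
 * reads.
 */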
2097 scratch_read_inst->base_mrf = 14;
2098 scratch_read_inst->mlen = 1;
2099 /* Move our instruction from the tail to its correct place. */
2100 scratch_read_inst->remove();
2101 inst->insert_before(scratch_read_inst);
2102 }
2103
2104 /**
2105 * Emits an instruction after @inst to store the value to be written
2106 * to @orig_dst to scratch space at @base_offset, from @temp.
2107 */
2108 void
2109 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2110 src_reg temp, dst_reg orig_dst,
2111 int base_offset)
2112 {
2113 int reg_offset = base_offset + orig_dst.reg_offset;
2114 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2115
2116 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2117 orig_dst.writemask));
2118 vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
2119 dst, temp, index);
2120 scratch_write_inst->base_mrf = 13;
2121 scratch_write_inst->mlen = 2;
2122 scratch_write_inst->predicate = inst->predicate;
2123 /* Move our instruction from the tail to its correct place. */
2124 scratch_write_inst->remove();
2125 inst->insert_after(scratch_write_inst);
2126 }
2127
2128 /**
2129 * We can't generally support array access in GRF space, because a
2130 * single instruction's destination can only span 2 contiguous
2131 * registers. So, we send all GRF arrays that get variable index
2132 * access to scratch space.
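* For example, a shader temporary like "vec4 tmp[8]" indexed with a
* loop counter has each such access rewritten below into scratch reads
* and writes.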
2133 */
2134 void
2135 vec4_visitor::move_grf_array_access_to_scratch()
2136 {
2137 int scratch_loc[this->virtual_grf_count];
2138
2139 for (int i = 0; i < this->virtual_grf_count; i++) {
2140 scratch_loc[i] = -1;
2141 }
2142
2143 /* First, calculate the set of virtual GRFs that need to be punted
2144 * to scratch due to having any array access on them, and where in
2145 * scratch.
2146 */
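/* Each vec4 element of an array takes 8 floats * 4 bytes of scratch,
 * matching the interleaved layout described in get_scratch_offset().
 */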
2147 foreach_list(node, &this->instructions) {
2148 vec4_instruction *inst = (vec4_instruction *)node;
2149
2150 if (inst->dst.file == GRF && inst->dst.reladdr &&
2151 scratch_loc[inst->dst.reg] == -1) {
2152 scratch_loc[inst->dst.reg] = c->last_scratch;
2153 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2154 }
2155
2156 for (int i = 0 ; i < 3; i++) {
2157 src_reg *src = &inst->src[i];
2158
2159 if (src->file == GRF && src->reladdr &&
2160 scratch_loc[src->reg] == -1) {
2161 scratch_loc[src->reg] = c->last_scratch;
2162 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2163 }
2164 }
2165 }
2166
2167 /* Now, for anything that will be accessed through scratch, rewrite
2168 * it to load/store. Note that this is a _safe list walk, because
2169 * we may generate a new scratch_write instruction after the one
2170 * we're processing.
2171 */
2172 foreach_list_safe(node, &this->instructions) {
2173 vec4_instruction *inst = (vec4_instruction *)node;
2174
2175 /* Set up the annotation tracking for newly generated instructions. */
2176 base_ir = inst->ir;
2177 current_annotation = inst->annotation;
2178
2179 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2180 src_reg temp = src_reg(this, glsl_type::vec4_type);
2181
2182 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2183
2184 inst->dst.file = temp.file;
2185 inst->dst.reg = temp.reg;
2186 inst->dst.reg_offset = temp.reg_offset;
2187 inst->dst.reladdr = NULL;
2188 }
2189
2190 for (int i = 0 ; i < 3; i++) {
2191 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2192 continue;
2193
2194 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2195
2196 emit_scratch_read(inst, temp, inst->src[i],
2197 scratch_loc[inst->src[i].reg]);
2198
2199 inst->src[i].file = temp.file;
2200 inst->src[i].reg = temp.reg;
2201 inst->src[i].reg_offset = temp.reg_offset;
2202 inst->src[i].reladdr = NULL;
2203 }
2204 }
2205 }
2206
2207 /**
2208 * Emits an instruction before @inst to load the value named by @orig_src
2209 * from the pull constant buffer (surface) at @base_offset to @temp.
2210 */
2211 void
2212 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2213 dst_reg temp, src_reg orig_src,
2214 int base_offset)
2215 {
2216 int reg_offset = base_offset + orig_src.reg_offset;
2217 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2218 vec4_instruction *load;
2219
2220 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2221 temp, index);
2222 load->annotation = inst->annotation;
2223 load->ir = inst->ir;
2224 load->base_mrf = 14;
2225 load->mlen = 1;
2226 inst->insert_before(load);
2227 }
2228
2229 /**
2230 * Implements array access of uniforms by inserting a
2231 * PULL_CONSTANT_LOAD instruction.
2232 *
2233 * Unlike temporary GRF array access (which we don't support, due to
2234 * the difficulty of doing relative addressing on instruction
2235 * destinations), we could potentially do array access of uniforms
2236 * that were loaded in GRF space as push constants. In real-world
2237 * usage we've seen, though, the arrays being used are always larger
2238 * than we could load as push constants, so just always move all
2239 * uniform array access out to a pull constant buffer.
2240 */
2241 void
2242 vec4_visitor::move_uniform_array_access_to_pull_constants()
2243 {
2244 int pull_constant_loc[this->uniforms];
2245
2246 for (int i = 0; i < this->uniforms; i++) {
2247 pull_constant_loc[i] = -1;
2248 }
2249
2250 /* Walk through and find array access of uniforms. Put a copy of that
2251 * uniform in the pull constant buffer.
2252 *
2253 * Note that we don't move constant-indexed accesses to arrays. No
2254 * testing has been done of the performance impact of this choice.
2255 */
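/* Only sources with a reladdr (variable index) are moved; their values
 * are appended to pull_param, and the access is rewritten to use the
 * result of a pull constant load.
 */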
2256 foreach_list_safe(node, &this->instructions) {
2257 vec4_instruction *inst = (vec4_instruction *)node;
2258
2259 for (int i = 0 ; i < 3; i++) {
2260 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2261 continue;
2262
2263 int uniform = inst->src[i].reg;
2264
2265 /* If this array isn't already present in the pull constant buffer,
2266 * add it.
2267 */
2268 if (pull_constant_loc[uniform] == -1) {
2269 const float **values = &prog_data->param[uniform * 4];
2270
2271 pull_constant_loc[uniform] = prog_data->nr_pull_params;
2272
2273 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2274 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2275 }
2276 }
2277
2278 /* Set up the annotation tracking for newly generated instructions. */
2279 base_ir = inst->ir;
2280 current_annotation = inst->annotation;
2281
2282 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2283
2284 emit_pull_constant_load(inst, temp, inst->src[i],
2285 pull_constant_loc[uniform]);
2286
2287 inst->src[i].file = temp.file;
2288 inst->src[i].reg = temp.reg;
2289 inst->src[i].reg_offset = temp.reg_offset;
2290 inst->src[i].reladdr = NULL;
2291 }
2292 }
2293
2294 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2295 * no need to track them as larger-than-vec4 objects. This will be
2296 * relied on in cutting out unused uniform vectors from push
2297 * constants.
2298 */
2299 split_uniform_registers();
2300 }
2301
2302 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2303 struct gl_shader_program *prog,
2304 struct brw_shader *shader)
2305 {
2306 this->c = c;
2307 this->p = &c->func;
2308 this->brw = p->brw;
2309 this->intel = &brw->intel;
2310 this->ctx = &intel->ctx;
2311 this->prog = prog;
2312 this->shader = shader;
2313
2314 this->mem_ctx = ralloc_context(NULL);
2315 this->failed = false;
2316
2317 this->base_ir = NULL;
2318 this->current_annotation = NULL;
2319
2321 this->vp = prog->VertexProgram;
2322 this->prog_data = &c->prog_data;
2323
2324 this->variable_ht = hash_table_ctor(0,
2325 hash_table_pointer_hash,
2326 hash_table_pointer_compare);
2327
2328 this->virtual_grf_def = NULL;
2329 this->virtual_grf_use = NULL;
2330 this->virtual_grf_sizes = NULL;
2331 this->virtual_grf_count = 0;
2332 this->virtual_grf_array_size = 0;
2333 this->live_intervals_valid = false;
2334
2335 this->uniforms = 0;
2340 }
2341
2342 vec4_visitor::~vec4_visitor()
2343 {
2344 ralloc_free(this->mem_ctx);
2345 hash_table_dtor(this->variable_ht);
2346 }
2347
2348
2349 void
2350 vec4_visitor::fail(const char *format, ...)
2351 {
2352 va_list va;
2353 char *msg;
2354
2355 if (failed)
2356 return;
2357
2358 failed = true;
2359
2360 va_start(va, format);
2361 msg = ralloc_vasprintf(mem_ctx, format, va);
2362 va_end(va);
2363 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2364
2365 this->fail_msg = msg;
2366
2367 if (INTEL_DEBUG & DEBUG_VS) {
2368 fprintf(stderr, "%s", msg);
2369 }
2370 }
2371
2372 } /* namespace brw */