src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 }
  29
  30 namespace brw {
  31
  32 src_reg::src_reg(dst_reg reg)
  33 {
  34    init();
  35
  36    this->file = reg.file;
  37    this->reg = reg.reg;
  38    this->reg_offset = reg.reg_offset;
  39    this->type = reg.type;
  40    this->reladdr = reg.reladdr;
  41    this->fixed_hw_reg = reg.fixed_hw_reg;
  42
  43    int swizzles[4];
  44    int next_chan = 0;
  45    int last = 0;
  46
  47    for (int i = 0; i < 4; i++) {
  48       if (!(reg.writemask & (1 << i)))
  49          continue;
  50
  51       swizzles[next_chan++] = last = i;
  52    }
  53
  54    for (; next_chan < 4; next_chan++) {
  55       swizzles[next_chan] = last;
  56    }
  57
  58    this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
  59                                 swizzles[2], swizzles[3]);
  60 }
  61
  62 dst_reg::dst_reg(src_reg reg)
  63 {
  64    init();
  65
  66    this->file = reg.file;
  67    this->reg = reg.reg;
  68    this->reg_offset = reg.reg_offset;
  69    this->type = reg.type;
  70    this->writemask = WRITEMASK_XYZW;
  71    this->reladdr = reg.reladdr;
  72    this->fixed_hw_reg = reg.fixed_hw_reg;
  73 }
  74
  75 vec4_instruction::vec4_instruction(vec4_visitor *v,
  76                                    enum opcode opcode, dst_reg dst,
  77                                    src_reg src0, src_reg src1, src_reg src2)
  78 {
  79    this->opcode = opcode;
  80    this->dst = dst;
  81    this->src[0] = src0;
  82    this->src[1] = src1;
  83    this->src[2] = src2;
  84    this->ir = v->base_ir;
  85    this->annotation = v->current_annotation;
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(vec4_instruction *inst)
  90 {
  91    this->instructions.push_tail(inst);
  92
  93    return inst;
  94 }
  95
  96 vec4_instruction *
  97 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  98 {
  99    new_inst->ir = inst->ir;
 100    new_inst->annotation = inst->annotation;
 101
 102    inst->insert_before(new_inst);
 103
 104    return inst;
 105 }
 106
 107 vec4_instruction *
 108 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
 109                    src_reg src0, src_reg src1, src_reg src2)
 110 {
 111    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
 112                                              src0, src1, src2));
 113 }
 114
 115
 116 vec4_instruction *
 117 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
 118 {
 119    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
 120 }
 121
 122 vec4_instruction *
 123 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
 124 {
 125    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
 126 }
 127
 128 vec4_instruction *
 129 vec4_visitor::emit(enum opcode opcode)
 130 {
 131    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
 132 }
 133
 134 #define ALU1(op)                                                        \
 135    vec4_instruction *                                                   \
 136    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
 137    {                                                                    \
 138       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 139                                            src0);                       \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    vec4_instruction *                                                   \
 144    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 145    {                                                                    \
 146       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 147                                            src0, src1);                 \
 148    }
 149
 150 ALU1(NOT)
 151 ALU1(MOV)
 152 ALU1(FRC)
 153 ALU1(RNDD)
 154 ALU1(RNDE)
 155 ALU1(RNDZ)
 156 ALU2(ADD)
 157 ALU2(MUL)
 158 ALU2(MACH)
 159 ALU2(AND)
 160 ALU2(OR)
 161 ALU2(XOR)
 162 ALU2(DP3)
 163 ALU2(DP4)
 164
 165 /** Gen4 predicated IF. */
 166 vec4_instruction *
 167 vec4_visitor::IF(uint32_t predicate)
 168 {
 169    vec4_instruction *inst;
 170
 171    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 172    inst->predicate = predicate;
 173
 174    return inst;
 175 }
 176
 177 /** Gen6+ IF with embedded comparison. */
 178 vec4_instruction *
 179 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 180 {
 181    assert(intel->gen >= 6);
 182
 183    vec4_instruction *inst;
 184
 185    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 186                                         src0, src1);
 187    inst->conditional_mod = condition;
 188
 189    return inst;
 190 }
 191
 192 /**
 193  * CMP: Sets the low bit of the destination channels with the result
 194  * of the comparison, while the upper bits are undefined, and updates
 195  * the flag register with the packed 16 bits of the result.
 196  */
 197 vec4_instruction *
 198 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 199 {
 200    vec4_instruction *inst;
 201
 202    /* original gen4 does type conversion to the destination type
 203     * before before comparison, producing garbage results for floating
 204     * point comparisons.
 205     */
 206    if (intel->gen == 4)
 207       dst.type = src0.type;
 208
 209    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 210    inst->conditional_mod = condition;
 211
 212    return inst;
 213 }
 214
 215 vec4_instruction *
 216 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 217 {
 218    vec4_instruction *inst;
 219
 220    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 221                                         dst, index);
 222    inst->base_mrf = 14;
 223    inst->mlen = 1;
 224
 225    return inst;
 226 }
 227
 228 vec4_instruction *
 229 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 230 {
 231    vec4_instruction *inst;
 232
 233    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 234                                         dst, src, index);
 235    inst->base_mrf = 13;
 236    inst->mlen = 2;
 237
 238    return inst;
 239 }
 240
 241 void
 242 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 243 {
 244    static enum opcode dot_opcodes[] = {
 245       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 246    };
 247
 248    emit(dot_opcodes[elements - 2], dst, src0, src1);
 249 }
 250
 251 void
 252 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 253 {
 254    /* The gen6 math instruction ignores the source modifiers --
 255     * swizzle, abs, negate, and at least some parts of the register
 256     * region description.
 257     *
 258     * While it would seem that this MOV could be avoided at this point
 259     * in the case that the swizzle is matched up with the destination
 260     * writemask, note that uniform packing and register allocation
 261     * could rearrange our swizzle, so let's leave this matter up to
 262     * copy propagation later.
 263     */
 264    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 265    emit(MOV(dst_reg(temp_src), src));
 266
 267    if (dst.writemask != WRITEMASK_XYZW) {
 268       /* The gen6 math instruction must be align1, so we can't do
 269        * writemasks.
 270        */
 271       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 272
 273       emit(opcode, temp_dst, temp_src);
 274
 275       emit(MOV(dst, src_reg(temp_dst)));
 276    } else {
 277       emit(opcode, dst, temp_src);
 278    }
 279 }
 280
 281 void
 282 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 283 {
 284    vec4_instruction *inst = emit(opcode, dst, src);
 285    inst->base_mrf = 1;
 286    inst->mlen = 1;
 287 }
 288
 289 void
 290 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 291 {
 292    switch (opcode) {
 293    case SHADER_OPCODE_RCP:
 294    case SHADER_OPCODE_RSQ:
 295    case SHADER_OPCODE_SQRT:
 296    case SHADER_OPCODE_EXP2:
 297    case SHADER_OPCODE_LOG2:
 298    case SHADER_OPCODE_SIN:
 299    case SHADER_OPCODE_COS:
 300       break;
 301    default:
 302       assert(!"not reached: bad math opcode");
 303       return;
 304    }
 305
 306    if (intel->gen >= 6) {
 307       return emit_math1_gen6(opcode, dst, src);
 308    } else {
 309       return emit_math1_gen4(opcode, dst, src);
 310    }
 311 }
 312
 313 void
 314 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 315                               dst_reg dst, src_reg src0, src_reg src1)
 316 {
 317    src_reg expanded;
 318
 319    /* The gen6 math instruction ignores the source modifiers --
 320     * swizzle, abs, negate, and at least some parts of the register
 321     * region description.  Move the sources to temporaries to make it
 322     * generally work.
 323     */
 324
 325    expanded = src_reg(this, glsl_type::vec4_type);
 326    emit(MOV(dst_reg(expanded), src0));
 327    src0 = expanded;
 328
 329    expanded = src_reg(this, glsl_type::vec4_type);
 330    emit(MOV(dst_reg(expanded), src1));
 331    src1 = expanded;
 332
 333    if (dst.writemask != WRITEMASK_XYZW) {
 334       /* The gen6 math instruction must be align1, so we can't do
 335        * writemasks.
 336        */
 337       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 338
 339       emit(opcode, temp_dst, src0, src1);
 340
 341       emit(MOV(dst, src_reg(temp_dst)));
 342    } else {
 343       emit(opcode, dst, src0, src1);
 344    }
 345 }
 346
 347 void
 348 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 349                               dst_reg dst, src_reg src0, src_reg src1)
 350 {
 351    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 352    inst->base_mrf = 1;
 353    inst->mlen = 2;
 354 }
 355
 356 void
 357 vec4_visitor::emit_math(enum opcode opcode,
 358                         dst_reg dst, src_reg src0, src_reg src1)
 359 {
 360    assert(opcode == SHADER_OPCODE_POW);
 361
 362    if (intel->gen >= 6) {
 363       return emit_math2_gen6(opcode, dst, src0, src1);
 364    } else {
 365       return emit_math2_gen4(opcode, dst, src0, src1);
 366    }
 367 }
 368
 369 void
 370 vec4_visitor::visit_instructions(const exec_list *list)
 371 {
 372    foreach_list(node, list) {
 373       ir_instruction *ir = (ir_instruction *)node;
 374
 375       base_ir = ir;
 376       ir->accept(this);
 377    }
 378 }
 379
 380
 381 static int
 382 type_size(const struct glsl_type *type)
 383 {
 384    unsigned int i;
 385    int size;
 386
 387    switch (type->base_type) {
 388    case GLSL_TYPE_UINT:
 389    case GLSL_TYPE_INT:
 390    case GLSL_TYPE_FLOAT:
 391    case GLSL_TYPE_BOOL:
 392       if (type->is_matrix()) {
 393          return type->matrix_columns;
 394       } else {
 395          /* Regardless of size of vector, it gets a vec4. This is bad
 396           * packing for things like floats, but otherwise arrays become a
 397           * mess.  Hopefully a later pass over the code can pack scalars
 398           * down if appropriate.
 399           */
 400          return 1;
 401       }
 402    case GLSL_TYPE_ARRAY:
 403       assert(type->length > 0);
 404       return type_size(type->fields.array) * type->length;
 405    case GLSL_TYPE_STRUCT:
 406       size = 0;
 407       for (i = 0; i < type->length; i++) {
 408          size += type_size(type->fields.structure[i].type);
 409       }
 410       return size;
 411    case GLSL_TYPE_SAMPLER:
 412       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 413        * at link time.
 414        */
 415       return 1;
 416    default:
 417       assert(0);
 418       return 0;
 419    }
 420 }
 421
 422 int
 423 vec4_visitor::virtual_grf_alloc(int size)
 424 {
 425    if (virtual_grf_array_size <= virtual_grf_count) {
 426       if (virtual_grf_array_size == 0)
 427          virtual_grf_array_size = 16;
 428       else
 429          virtual_grf_array_size *= 2;
 430       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 431                                    virtual_grf_array_size);
 432    }
 433    virtual_grf_sizes[virtual_grf_count] = size;
 434    return virtual_grf_count++;
 435 }
 436
 437 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 438 {
 439    init();
 440
 441    this->file = GRF;
 442    this->reg = v->virtual_grf_alloc(type_size(type));
 443
 444    if (type->is_array() || type->is_record()) {
 445       this->swizzle = BRW_SWIZZLE_NOOP;
 446    } else {
 447       this->swizzle = swizzle_for_size(type->vector_elements);
 448    }
 449
 450    this->type = brw_type_for_base_type(type);
 451 }
 452
 453 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 454 {
 455    init();
 456
 457    this->file = GRF;
 458    this->reg = v->virtual_grf_alloc(type_size(type));
 459
 460    if (type->is_array() || type->is_record()) {
 461       this->writemask = WRITEMASK_XYZW;
 462    } else {
 463       this->writemask = (1 << type->vector_elements) - 1;
 464    }
 465
 466    this->type = brw_type_for_base_type(type);
 467 }
 468
 469 /* Our support for uniforms is piggy-backed on the struct
 470  * gl_fragment_program, because that's where the values actually
 471  * get stored, rather than in some global gl_shader_program uniform
 472  * store.
 473  */
 474 int
 475 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 476 {
 477    unsigned int offset = 0;
 478    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 479
 480    if (type->is_matrix()) {
 481       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 482                                                         type->vector_elements,
 483                                                         1);
 484
 485       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 486          offset += setup_uniform_values(loc + offset, column);
 487       }
 488
 489       return offset;
 490    }
 491
 492    switch (type->base_type) {
 493    case GLSL_TYPE_FLOAT:
 494    case GLSL_TYPE_UINT:
 495    case GLSL_TYPE_INT:
 496    case GLSL_TYPE_BOOL:
 497       for (unsigned int i = 0; i < type->vector_elements; i++) {
 498          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 499       }
 500
 501       /* Set up pad elements to get things aligned to a vec4 boundary. */
 502       for (unsigned int i = type->vector_elements; i < 4; i++) {
 503          static float zero = 0;
 504
 505          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 506       }
 507
 508       /* Track the size of this uniform vector, for future packing of
 509        * uniforms.
 510        */
 511       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 512       this->uniforms++;
 513
 514       return 1;
 515
 516    case GLSL_TYPE_STRUCT:
 517       for (unsigned int i = 0; i < type->length; i++) {
 518          offset += setup_uniform_values(loc + offset,
 519                                         type->fields.structure[i].type);
 520       }
 521       return offset;
 522
 523    case GLSL_TYPE_ARRAY:
 524       for (unsigned int i = 0; i < type->length; i++) {
 525          offset += setup_uniform_values(loc + offset, type->fields.array);
 526       }
 527       return offset;
 528
 529    case GLSL_TYPE_SAMPLER:
 530       /* The sampler takes up a slot, but we don't use any values from it. */
 531       return 1;
 532
 533    default:
 534       assert(!"not reached");
 535       return 0;
 536    }
 537 }
 538
 539 /* Our support for builtin uniforms is even scarier than non-builtin.
 540  * It sits on top of the PROG_STATE_VAR parameters that are
 541  * automatically updated from GL context state.
 542  */
 543 void
 544 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 545 {
 546    const ir_state_slot *const slots = ir->state_slots;
 547    assert(ir->state_slots != NULL);
 548
 549    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 550       /* This state reference has already been setup by ir_to_mesa,
 551        * but we'll get the same index back here.  We can reference
 552        * ParameterValues directly, since unlike brw_fs.cpp, we never
 553        * add new state references during compile.
 554        */
 555       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 556                                             (gl_state_index *)slots[i].tokens);
 557       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 558
 559       this->uniform_vector_size[this->uniforms] = 0;
 560       /* Add each of the unique swizzled channels of the element.
 561        * This will end up matching the size of the glsl_type of this field.
 562        */
 563       int last_swiz = -1;
 564       for (unsigned int j = 0; j < 4; j++) {
 565          int swiz = GET_SWZ(slots[i].swizzle, j);
 566          last_swiz = swiz;
 567
 568          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 569          if (swiz <= last_swiz)
 570             this->uniform_vector_size[this->uniforms]++;
 571       }
 572       this->uniforms++;
 573    }
 574 }
 575
 576 dst_reg *
 577 vec4_visitor::variable_storage(ir_variable *var)
 578 {
 579    return (dst_reg *)hash_table_find(this->variable_ht, var);
 580 }
 581
 582 void
 583 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
 584 {
 585    ir_expression *expr = ir->as_expression();
 586
 587    if (expr) {
 588       src_reg op[2];
 589       vec4_instruction *inst;
 590
 591       assert(expr->get_num_operands() <= 2);
 592       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 593          assert(expr->operands[i]->type->is_scalar());
 594
 595          expr->operands[i]->accept(this);
 596          op[i] = this->result;
 597       }
 598
 599       switch (expr->operation) {
 600       case ir_unop_logic_not:
 601          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 602          inst->conditional_mod = BRW_CONDITIONAL_Z;
 603          break;
 604
 605       case ir_binop_logic_xor:
 606          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 607          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 608          break;
 609
 610       case ir_binop_logic_or:
 611          inst = emit(OR(dst_null_d(), op[0], op[1]));
 612          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 613          break;
 614
 615       case ir_binop_logic_and:
 616          inst = emit(AND(dst_null_d(), op[0], op[1]));
 617          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 618          break;
 619
 620       case ir_unop_f2b:
 621          if (intel->gen >= 6) {
 622             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 623          } else {
 624             inst = emit(MOV(dst_null_f(), op[0]));
 625             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 626          }
 627          break;
 628
 629       case ir_unop_i2b:
 630          if (intel->gen >= 6) {
 631             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 632          } else {
 633             inst = emit(MOV(dst_null_d(), op[0]));
 634             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 635          }
 636          break;
 637
 638       case ir_binop_greater:
 639       case ir_binop_gequal:
 640       case ir_binop_less:
 641       case ir_binop_lequal:
 642       case ir_binop_equal:
 643       case ir_binop_all_equal:
 644       case ir_binop_nequal:
 645       case ir_binop_any_nequal:
 646          emit(CMP(dst_null_d(), op[0], op[1],
 647                   brw_conditional_for_comparison(expr->operation)));
 648          break;
 649
 650       default:
 651          assert(!"not reached");
 652          break;
 653       }
 654       return;
 655    }
 656
 657    ir->accept(this);
 658
 659    if (intel->gen >= 6) {
 660       vec4_instruction *inst = emit(AND(dst_null_d(),
 661                                         this->result, src_reg(1)));
 662       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 663    } else {
 664       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 665       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 666    }
 667 }
 668
 669 /**
 670  * Emit a gen6 IF statement with the comparison folded into the IF
 671  * instruction.
 672  */
 673 void
 674 vec4_visitor::emit_if_gen6(ir_if *ir)
 675 {
 676    ir_expression *expr = ir->condition->as_expression();
 677
 678    if (expr) {
 679       src_reg op[2];
 680       dst_reg temp;
 681
 682       assert(expr->get_num_operands() <= 2);
 683       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 684          expr->operands[i]->accept(this);
 685          op[i] = this->result;
 686       }
 687
 688       switch (expr->operation) {
 689       case ir_unop_logic_not:
 690          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 691          return;
 692
 693       case ir_binop_logic_xor:
 694          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 695          return;
 696
 697       case ir_binop_logic_or:
 698          temp = dst_reg(this, glsl_type::bool_type);
 699          emit(OR(temp, op[0], op[1]));
 700          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 701          return;
 702
 703       case ir_binop_logic_and:
 704          temp = dst_reg(this, glsl_type::bool_type);
 705          emit(AND(temp, op[0], op[1]));
 706          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 707          return;
 708
 709       case ir_unop_f2b:
 710          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 711          return;
 712
 713       case ir_unop_i2b:
 714          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 715          return;
 716
 717       case ir_binop_greater:
 718       case ir_binop_gequal:
 719       case ir_binop_less:
 720       case ir_binop_lequal:
 721       case ir_binop_equal:
 722       case ir_binop_nequal:
 723          emit(IF(op[0], op[1],
 724                  brw_conditional_for_comparison(expr->operation)));
 725          return;
 726
 727       case ir_binop_all_equal:
 728          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 729          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 730          return;
 731
 732       case ir_binop_any_nequal:
 733          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 734          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 735          return;
 736
 737       case ir_unop_any:
 738          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 739          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 740          return;
 741
 742       default:
 743          assert(!"not reached");
 744          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 745          return;
 746       }
 747       return;
 748    }
 749
 750    ir->condition->accept(this);
 751
 752    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 753 }
 754
 755 void
 756 vec4_visitor::visit(ir_variable *ir)
 757 {
 758    dst_reg *reg = NULL;
 759
 760    if (variable_storage(ir))
 761       return;
 762
 763    switch (ir->mode) {
 764    case ir_var_in:
 765       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 766
 767       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 768        * come in as floating point conversions of the integer values.
 769        */
 770       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 771          if (!c->key.gl_fixed_input_size[i])
 772             continue;
 773
 774          dst_reg dst = *reg;
 775          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 776          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 777       }
 778       break;
 779
 780    case ir_var_out:
 781       reg = new(mem_ctx) dst_reg(this, ir->type);
 782
 783       for (int i = 0; i < type_size(ir->type); i++) {
 784          output_reg[ir->location + i] = *reg;
 785          output_reg[ir->location + i].reg_offset = i;
 786          output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
 787       }
 788       break;
 789
 790    case ir_var_auto:
 791    case ir_var_temporary:
 792       reg = new(mem_ctx) dst_reg(this, ir->type);
 793       break;
 794
 795    case ir_var_uniform:
 796       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 797
 798       /* Track how big the whole uniform variable is, in case we need to put a
 799        * copy of its data into pull constants for array access.
 800        */
 801       this->uniform_size[this->uniforms] = type_size(ir->type);
 802
 803       if (!strncmp(ir->name, "gl_", 3)) {
 804          setup_builtin_uniform_values(ir);
 805       } else {
 806          setup_uniform_values(ir->location, ir->type);
 807       }
 808       break;
 809
 810    default:
 811       assert(!"not reached");
 812    }
 813
 814    reg->type = brw_type_for_base_type(ir->type);
 815    hash_table_insert(this->variable_ht, reg, ir);
 816 }
 817
 818 void
 819 vec4_visitor::visit(ir_loop *ir)
 820 {
 821    dst_reg counter;
 822
 823    /* We don't want debugging output to print the whole body of the
 824     * loop as the annotation.
 825     */
 826    this->base_ir = NULL;
 827
 828    if (ir->counter != NULL) {
 829       this->base_ir = ir->counter;
 830       ir->counter->accept(this);
 831       counter = *(variable_storage(ir->counter));
 832
 833       if (ir->from != NULL) {
 834          this->base_ir = ir->from;
 835          ir->from->accept(this);
 836
 837          emit(MOV(counter, this->result));
 838       }
 839    }
 840
 841    emit(BRW_OPCODE_DO);
 842
 843    if (ir->to) {
 844       this->base_ir = ir->to;
 845       ir->to->accept(this);
 846
 847       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 848                brw_conditional_for_comparison(ir->cmp)));
 849
 850       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 851       inst->predicate = BRW_PREDICATE_NORMAL;
 852    }
 853
 854    visit_instructions(&ir->body_instructions);
 855
 856
 857    if (ir->increment) {
 858       this->base_ir = ir->increment;
 859       ir->increment->accept(this);
 860       emit(ADD(counter, src_reg(counter), this->result));
 861    }
 862
 863    emit(BRW_OPCODE_WHILE);
 864 }
 865
 866 void
 867 vec4_visitor::visit(ir_loop_jump *ir)
 868 {
 869    switch (ir->mode) {
 870    case ir_loop_jump::jump_break:
 871       emit(BRW_OPCODE_BREAK);
 872       break;
 873    case ir_loop_jump::jump_continue:
 874       emit(BRW_OPCODE_CONTINUE);
 875       break;
 876    }
 877 }
 878
 879
 880 void
 881 vec4_visitor::visit(ir_function_signature *ir)
 882 {
 883    assert(0);
 884    (void)ir;
 885 }
 886
 887 void
 888 vec4_visitor::visit(ir_function *ir)
 889 {
 890    /* Ignore function bodies other than main() -- we shouldn't see calls to
 891     * them since they should all be inlined.
 892     */
 893    if (strcmp(ir->name, "main") == 0) {
 894       const ir_function_signature *sig;
 895       exec_list empty;
 896
 897       sig = ir->matching_signature(&empty);
 898
 899       assert(sig);
 900
 901       visit_instructions(&sig->body);
 902    }
 903 }
 904
 905 GLboolean
 906 vec4_visitor::try_emit_sat(ir_expression *ir)
 907 {
 908    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 909    if (!sat_src)
 910       return false;
 911
 912    sat_src->accept(this);
 913    src_reg src = this->result;
 914
 915    this->result = src_reg(this, ir->type);
 916    vec4_instruction *inst;
 917    inst = emit(MOV(dst_reg(this->result), src));
 918    inst->saturate = true;
 919
 920    return true;
 921 }
 922
 923 void
 924 vec4_visitor::emit_bool_comparison(unsigned int op,
 925                                  dst_reg dst, src_reg src0, src_reg src1)
 926 {
 927    /* original gen4 does destination conversion before comparison. */
 928    if (intel->gen < 5)
 929       dst.type = src0.type;
 930
 931    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 932
 933    dst.type = BRW_REGISTER_TYPE_D;
 934    emit(AND(dst, src_reg(dst), src_reg(0x1)));
 935 }
 936
 937 void
 938 vec4_visitor::visit(ir_expression *ir)
 939 {
 940    unsigned int operand;
 941    src_reg op[Elements(ir->operands)];
 942    src_reg result_src;
 943    dst_reg result_dst;
 944    vec4_instruction *inst;
 945
 946    if (try_emit_sat(ir))
 947       return;
 948
 949    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 950       this->result.file = BAD_FILE;
 951       ir->operands[operand]->accept(this);
 952       if (this->result.file == BAD_FILE) {
 953          printf("Failed to get tree for expression operand:\n");
 954          ir->operands[operand]->print();
 955          exit(1);
 956       }
 957       op[operand] = this->result;
 958
 959       /* Matrix expression operands should have been broken down to vector
 960        * operations already.
 961        */
 962       assert(!ir->operands[operand]->type->is_matrix());
 963    }
 964
 965    int vector_elements = ir->operands[0]->type->vector_elements;
 966    if (ir->operands[1]) {
 967       vector_elements = MAX2(vector_elements,
 968                              ir->operands[1]->type->vector_elements);
 969    }
 970
 971    this->result.file = BAD_FILE;
 972
 973    /* Storage for our result.  Ideally for an assignment we'd be using
 974     * the actual storage for the result here, instead.
 975     */
 976    result_src = src_reg(this, ir->type);
 977    /* convenience for the emit functions below. */
 978    result_dst = dst_reg(result_src);
 979    /* If nothing special happens, this is the result. */
 980    this->result = result_src;
 981    /* Limit writes to the channels that will be used by result_src later.
 982     * This does limit this temp's use as a temporary for multi-instruction
 983     * sequences.
 984     */
 985    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
 986
 987    switch (ir->operation) {
 988    case ir_unop_logic_not:
 989       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
 990        * ones complement of the whole register, not just bit 0.
 991        */
 992       emit(XOR(result_dst, op[0], src_reg(1)));
 993       break;
 994    case ir_unop_neg:
 995       op[0].negate = !op[0].negate;
 996       this->result = op[0];
 997       break;
 998    case ir_unop_abs:
 999       op[0].abs = true;
1000       op[0].negate = false;
1001       this->result = op[0];
1002       break;
1003
1004    case ir_unop_sign:
1005       emit(MOV(result_dst, src_reg(0.0f)));
1006
1007       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1008       inst = emit(MOV(result_dst, src_reg(1.0f)));
1009       inst->predicate = BRW_PREDICATE_NORMAL;
1010
1011       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1012       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1013       inst->predicate = BRW_PREDICATE_NORMAL;
1014
1015       break;
1016
1017    case ir_unop_rcp:
1018       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1019       break;
1020
1021    case ir_unop_exp2:
1022       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1023       break;
1024    case ir_unop_log2:
1025       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1026       break;
1027    case ir_unop_exp:
1028    case ir_unop_log:
1029       assert(!"not reached: should be handled by ir_explog_to_explog2");
1030       break;
1031    case ir_unop_sin:
1032    case ir_unop_sin_reduced:
1033       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1034       break;
1035    case ir_unop_cos:
1036    case ir_unop_cos_reduced:
1037       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1038       break;
1039
1040    case ir_unop_dFdx:
1041    case ir_unop_dFdy:
1042       assert(!"derivatives not valid in vertex shader");
1043       break;
1044
1045    case ir_unop_noise:
1046       assert(!"not reached: should be handled by lower_noise");
1047       break;
1048
1049    case ir_binop_add:
1050       emit(ADD(result_dst, op[0], op[1]));
1051       break;
1052    case ir_binop_sub:
1053       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1054       break;
1055
1056    case ir_binop_mul:
1057       if (ir->type->is_integer()) {
1058          /* For integer multiplication, the MUL uses the low 16 bits
1059           * of one of the operands (src0 on gen6, src1 on gen7).  The
1060           * MACH accumulates in the contribution of the upper 16 bits
1061           * of that operand.
1062           *
1063           * FINISHME: Emit just the MUL if we know an operand is small
1064           * enough.
1065           */
1066          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1067
1068          emit(MUL(acc, op[0], op[1]));
1069          emit(MACH(dst_null_d(), op[0], op[1]));
1070          emit(MOV(result_dst, src_reg(acc)));
1071       } else {
1072          emit(MUL(result_dst, op[0], op[1]));
1073       }
1074       break;
1075    case ir_binop_div:
1076       assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1077    case ir_binop_mod:
1078       assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1079       break;
1080
1081    case ir_binop_less:
1082    case ir_binop_greater:
1083    case ir_binop_lequal:
1084    case ir_binop_gequal:
1085    case ir_binop_equal:
1086    case ir_binop_nequal: {
1087       emit(CMP(result_dst, op[0], op[1],
1088                brw_conditional_for_comparison(ir->operation)));
1089       emit(AND(result_dst, result_src, src_reg(0x1)));
1090       break;
1091    }
1092
1093    case ir_binop_all_equal:
1094       /* "==" operator producing a scalar boolean. */
1095       if (ir->operands[0]->type->is_vector() ||
1096           ir->operands[1]->type->is_vector()) {
1097          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1098          emit(MOV(result_dst, src_reg(0)));
1099          inst = emit(MOV(result_dst, src_reg(1)));
1100          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1101       } else {
1102          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1103          emit(AND(result_dst, result_src, src_reg(0x1)));
1104       }
1105       break;
1106    case ir_binop_any_nequal:
1107       /* "!=" operator producing a scalar boolean. */
1108       if (ir->operands[0]->type->is_vector() ||
1109           ir->operands[1]->type->is_vector()) {
1110          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1111
1112          emit(MOV(result_dst, src_reg(0)));
1113          inst = emit(MOV(result_dst, src_reg(1)));
1114          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1115       } else {
1116          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1117          emit(AND(result_dst, result_src, src_reg(0x1)));
1118       }
1119       break;
1120
1121    case ir_unop_any:
1122       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1123       emit(MOV(result_dst, src_reg(0)));
1124
1125       inst = emit(MOV(result_dst, src_reg(1)));
1126       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1127       break;
1128
1129    case ir_binop_logic_xor:
1130       emit(XOR(result_dst, op[0], op[1]));
1131       break;
1132
1133    case ir_binop_logic_or:
1134       emit(OR(result_dst, op[0], op[1]));
1135       break;
1136
1137    case ir_binop_logic_and:
1138       emit(AND(result_dst, op[0], op[1]));
1139       break;
1140
1141    case ir_binop_dot:
1142       assert(ir->operands[0]->type->is_vector());
1143       assert(ir->operands[0]->type == ir->operands[1]->type);
1144       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1145       break;
1146
1147    case ir_unop_sqrt:
1148       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1149       break;
1150    case ir_unop_rsq:
1151       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1152       break;
1153    case ir_unop_i2f:
1154    case ir_unop_i2u:
1155    case ir_unop_u2i:
1156    case ir_unop_u2f:
1157    case ir_unop_b2f:
1158    case ir_unop_b2i:
1159    case ir_unop_f2i:
1160       emit(MOV(result_dst, op[0]));
1161       break;
1162    case ir_unop_f2b:
1163    case ir_unop_i2b: {
1164       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1165       emit(AND(result_dst, result_src, src_reg(1)));
1166       break;
1167    }
1168
1169    case ir_unop_trunc:
1170       emit(RNDZ(result_dst, op[0]));
1171       break;
1172    case ir_unop_ceil:
1173       op[0].negate = !op[0].negate;
1174       inst = emit(RNDD(result_dst, op[0]));
1175       this->result.negate = true;
1176       break;
1177    case ir_unop_floor:
1178       inst = emit(RNDD(result_dst, op[0]));
1179       break;
1180    case ir_unop_fract:
1181       inst = emit(FRC(result_dst, op[0]));
1182       break;
1183    case ir_unop_round_even:
1184       emit(RNDE(result_dst, op[0]));
1185       break;
1186
1187    case ir_binop_min:
1188       emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1189
1190       inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1191       inst->predicate = BRW_PREDICATE_NORMAL;
1192       break;
1193    case ir_binop_max:
1194       emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1195
1196       inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1197       inst->predicate = BRW_PREDICATE_NORMAL;
1198       break;
1199
1200    case ir_binop_pow:
1201       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1202       break;
1203
1204    case ir_unop_bit_not:
1205       inst = emit(NOT(result_dst, op[0]));
1206       break;
1207    case ir_binop_bit_and:
1208       inst = emit(AND(result_dst, op[0], op[1]));
1209       break;
1210    case ir_binop_bit_xor:
1211       inst = emit(XOR(result_dst, op[0], op[1]));
1212       break;
1213    case ir_binop_bit_or:
1214       inst = emit(OR(result_dst, op[0], op[1]));
1215       break;
1216
1217    case ir_binop_lshift:
1218    case ir_binop_rshift:
1219       assert(!"GLSL 1.30 features unsupported");
1220       break;
1221
1222    case ir_quadop_vector:
1223       assert(!"not reached: should be handled by lower_quadop_vector");
1224       break;
1225    }
1226 }
1227
1228
1229 void
1230 vec4_visitor::visit(ir_swizzle *ir)
1231 {
1232    src_reg src;
1233    int i = 0;
1234    int swizzle[4];
1235
1236    /* Note that this is only swizzles in expressions, not those on the left
1237     * hand side of an assignment, which do write masking.  See ir_assignment
1238     * for that.
1239     */
1240
1241    ir->val->accept(this);
1242    src = this->result;
1243    assert(src.file != BAD_FILE);
1244
1245    for (i = 0; i < ir->type->vector_elements; i++) {
1246       switch (i) {
1247       case 0:
1248          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1249          break;
1250       case 1:
1251          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1252          break;
1253       case 2:
1254          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1255          break;
1256       case 3:
1257          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1258             break;
1259       }
1260    }
1261    for (; i < 4; i++) {
1262       /* Replicate the last channel out. */
1263       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1264    }
1265
1266    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1267
1268    this->result = src;
1269 }
1270
1271 void
1272 vec4_visitor::visit(ir_dereference_variable *ir)
1273 {
1274    const struct glsl_type *type = ir->type;
1275    dst_reg *reg = variable_storage(ir->var);
1276
1277    if (!reg) {
1278       fail("Failed to find variable storage for %s\n", ir->var->name);
1279       this->result = src_reg(brw_null_reg());
1280       return;
1281    }
1282
1283    this->result = src_reg(*reg);
1284
1285    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1286       this->result.swizzle = swizzle_for_size(type->vector_elements);
1287 }
1288
1289 void
1290 vec4_visitor::visit(ir_dereference_array *ir)
1291 {
1292    ir_constant *constant_index;
1293    src_reg src;
1294    int element_size = type_size(ir->type);
1295
1296    constant_index = ir->array_index->constant_expression_value();
1297
1298    ir->array->accept(this);
1299    src = this->result;
1300
1301    if (constant_index) {
1302       src.reg_offset += constant_index->value.i[0] * element_size;
1303    } else {
1304       /* Variable index array dereference.  It eats the "vec4" of the
1305        * base of the array and an index that offsets the Mesa register
1306        * index.
1307        */
1308       ir->array_index->accept(this);
1309
1310       src_reg index_reg;
1311
1312       if (element_size == 1) {
1313          index_reg = this->result;
1314       } else {
1315          index_reg = src_reg(this, glsl_type::int_type);
1316
1317          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1318       }
1319
1320       if (src.reladdr) {
1321          src_reg temp = src_reg(this, glsl_type::int_type);
1322
1323          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1324
1325          index_reg = temp;
1326       }
1327
1328       src.reladdr = ralloc(mem_ctx, src_reg);
1329       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1330    }
1331
1332    /* If the type is smaller than a vec4, replicate the last channel out. */
1333    if (ir->type->is_scalar() || ir->type->is_vector())
1334       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1335    else
1336       src.swizzle = BRW_SWIZZLE_NOOP;
1337    src.type = brw_type_for_base_type(ir->type);
1338
1339    this->result = src;
1340 }
1341
1342 void
1343 vec4_visitor::visit(ir_dereference_record *ir)
1344 {
1345    unsigned int i;
1346    const glsl_type *struct_type = ir->record->type;
1347    int offset = 0;
1348
1349    ir->record->accept(this);
1350
1351    for (i = 0; i < struct_type->length; i++) {
1352       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1353          break;
1354       offset += type_size(struct_type->fields.structure[i].type);
1355    }
1356
1357    /* If the type is smaller than a vec4, replicate the last channel out. */
1358    if (ir->type->is_scalar() || ir->type->is_vector())
1359       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1360    else
1361       this->result.swizzle = BRW_SWIZZLE_NOOP;
1362    this->result.type = brw_type_for_base_type(ir->type);
1363
1364    this->result.reg_offset += offset;
1365 }
1366
1367 /**
1368  * We want to be careful in assignment setup to hit the actual storage
1369  * instead of potentially using a temporary like we might with the
1370  * ir_dereference handler.
1371  */
1372 static dst_reg
1373 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1374 {
1375    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1376     * access of a vector, it must be separated into a series conditional moves
1377     * before reaching this point (see ir_vec_index_to_cond_assign).
1378     */
1379    assert(ir->as_dereference());
1380    ir_dereference_array *deref_array = ir->as_dereference_array();
1381    if (deref_array) {
1382       assert(!deref_array->array->type->is_vector());
1383    }
1384
1385    /* Use the rvalue deref handler for the most part.  We'll ignore
1386     * swizzles in it and write swizzles using writemask, though.
1387     */
1388    ir->accept(v);
1389    return dst_reg(v->result);
1390 }
1391
1392 void
1393 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1394                               const struct glsl_type *type, bool predicated)
1395 {
1396    if (type->base_type == GLSL_TYPE_STRUCT) {
1397       for (unsigned int i = 0; i < type->length; i++) {
1398          emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1399       }
1400       return;
1401    }
1402
1403    if (type->is_array()) {
1404       for (unsigned int i = 0; i < type->length; i++) {
1405          emit_block_move(dst, src, type->fields.array, predicated);
1406       }
1407       return;
1408    }
1409
1410    if (type->is_matrix()) {
1411       const struct glsl_type *vec_type;
1412
1413       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1414                                          type->vector_elements, 1);
1415
1416       for (int i = 0; i < type->matrix_columns; i++) {
1417          emit_block_move(dst, src, vec_type, predicated);
1418       }
1419       return;
1420    }
1421
1422    assert(type->is_scalar() || type->is_vector());
1423
1424    dst->type = brw_type_for_base_type(type);
1425    src->type = dst->type;
1426
1427    dst->writemask = (1 << type->vector_elements) - 1;
1428
1429    /* Do we need to worry about swizzling a swizzle? */
1430    assert(src->swizzle = BRW_SWIZZLE_NOOP);
1431    src->swizzle = swizzle_for_size(type->vector_elements);
1432
1433    vec4_instruction *inst = emit(MOV(*dst, *src));
1434    if (predicated)
1435       inst->predicate = BRW_PREDICATE_NORMAL;
1436
1437    dst->reg_offset++;
1438    src->reg_offset++;
1439 }
1440
1441
1442 /* If the RHS processing resulted in an instruction generating a
1443  * temporary value, and it would be easy to rewrite the instruction to
1444  * generate its result right into the LHS instead, do so.  This ends
1445  * up reliably removing instructions where it can be tricky to do so
1446  * later without real UD chain information.
1447  */
1448 bool
1449 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1450                                      dst_reg dst,
1451                                      src_reg src,
1452                                      vec4_instruction *pre_rhs_inst,
1453                                      vec4_instruction *last_rhs_inst)
1454 {
1455    /* This could be supported, but it would take more smarts. */
1456    if (ir->condition)
1457       return false;
1458
1459    if (pre_rhs_inst == last_rhs_inst)
1460       return false; /* No instructions generated to work with. */
1461
1462    /* Make sure the last instruction generated our source reg. */
1463    if (src.file != GRF ||
1464        src.file != last_rhs_inst->dst.file ||
1465        src.reg != last_rhs_inst->dst.reg ||
1466        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1467        src.reladdr ||
1468        src.abs ||
1469        src.negate ||
1470        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1471       return false;
1472
1473    /* Check that that last instruction fully initialized the channels
1474     * we want to use, in the order we want to use them.  We could
1475     * potentially reswizzle the operands of many instructions so that
1476     * we could handle out of order channels, but don't yet.
1477     */
1478    for (int i = 0; i < 4; i++) {
1479       if (dst.writemask & (1 << i)) {
1480          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1481             return false;
1482
1483          if (BRW_GET_SWZ(src.swizzle, i) != i)
1484             return false;
1485       }
1486    }
1487
1488    /* Success!  Rewrite the instruction. */
1489    last_rhs_inst->dst.file = dst.file;
1490    last_rhs_inst->dst.reg = dst.reg;
1491    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1492    last_rhs_inst->dst.reladdr = dst.reladdr;
1493    last_rhs_inst->dst.writemask &= dst.writemask;
1494
1495    return true;
1496 }
1497
1498 void
1499 vec4_visitor::visit(ir_assignment *ir)
1500 {
1501    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1502
1503    if (!ir->lhs->type->is_scalar() &&
1504        !ir->lhs->type->is_vector()) {
1505       ir->rhs->accept(this);
1506       src_reg src = this->result;
1507
1508       if (ir->condition) {
1509          emit_bool_to_cond_code(ir->condition);
1510       }
1511
1512       emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1513       return;
1514    }
1515
1516    /* Now we're down to just a scalar/vector with writemasks. */
1517    int i;
1518
1519    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1520    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1521
1522    ir->rhs->accept(this);
1523
1524    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1525
1526    src_reg src = this->result;
1527
1528    int swizzles[4];
1529    int first_enabled_chan = 0;
1530    int src_chan = 0;
1531
1532    assert(ir->lhs->type->is_vector() ||
1533           ir->lhs->type->is_scalar());
1534    dst.writemask = ir->write_mask;
1535
1536    for (int i = 0; i < 4; i++) {
1537       if (dst.writemask & (1 << i)) {
1538          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1539          break;
1540       }
1541    }
1542
1543    /* Swizzle a small RHS vector into the channels being written.
1544     *
1545     * glsl ir treats write_mask as dictating how many channels are
1546     * present on the RHS while in our instructions we need to make
1547     * those channels appear in the slots of the vec4 they're written to.
1548     */
1549    for (int i = 0; i < 4; i++) {
1550       if (dst.writemask & (1 << i))
1551          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1552       else
1553          swizzles[i] = first_enabled_chan;
1554    }
1555    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1556                               swizzles[2], swizzles[3]);
1557
1558    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1559       return;
1560    }
1561
1562    if (ir->condition) {
1563       emit_bool_to_cond_code(ir->condition);
1564    }
1565
1566    for (i = 0; i < type_size(ir->lhs->type); i++) {
1567       vec4_instruction *inst = emit(MOV(dst, src));
1568
1569       if (ir->condition)
1570          inst->predicate = BRW_PREDICATE_NORMAL;
1571
1572       dst.reg_offset++;
1573       src.reg_offset++;
1574    }
1575 }
1576
1577 void
1578 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1579 {
1580    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1581       foreach_list(node, &ir->components) {
1582          ir_constant *field_value = (ir_constant *)node;
1583
1584          emit_constant_values(dst, field_value);
1585       }
1586       return;
1587    }
1588
1589    if (ir->type->is_array()) {
1590       for (unsigned int i = 0; i < ir->type->length; i++) {
1591          emit_constant_values(dst, ir->array_elements[i]);
1592       }
1593       return;
1594    }
1595
1596    if (ir->type->is_matrix()) {
1597       for (int i = 0; i < ir->type->matrix_columns; i++) {
1598          for (int j = 0; j < ir->type->vector_elements; j++) {
1599             dst->writemask = 1 << j;
1600             dst->type = BRW_REGISTER_TYPE_F;
1601
1602             emit(MOV(*dst,
1603                      src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1604          }
1605          dst->reg_offset++;
1606       }
1607       return;
1608    }
1609
1610    for (int i = 0; i < ir->type->vector_elements; i++) {
1611       dst->writemask = 1 << i;
1612       dst->type = brw_type_for_base_type(ir->type);
1613
1614       switch (ir->type->base_type) {
1615       case GLSL_TYPE_FLOAT:
1616          emit(MOV(*dst, src_reg(ir->value.f[i])));
1617          break;
1618       case GLSL_TYPE_INT:
1619          emit(MOV(*dst, src_reg(ir->value.i[i])));
1620          break;
1621       case GLSL_TYPE_UINT:
1622          emit(MOV(*dst, src_reg(ir->value.u[i])));
1623          break;
1624       case GLSL_TYPE_BOOL:
1625          emit(MOV(*dst, src_reg(ir->value.b[i])));
1626          break;
1627       default:
1628          assert(!"Non-float/uint/int/bool constant");
1629          break;
1630       }
1631    }
1632    dst->reg_offset++;
1633 }
1634
1635 void
1636 vec4_visitor::visit(ir_constant *ir)
1637 {
1638    dst_reg dst = dst_reg(this, ir->type);
1639    this->result = src_reg(dst);
1640
1641    emit_constant_values(&dst, ir);
1642 }
1643
1644 void
1645 vec4_visitor::visit(ir_call *ir)
1646 {
1647    assert(!"not reached");
1648 }
1649
1650 void
1651 vec4_visitor::visit(ir_texture *ir)
1652 {
1653    /* FINISHME: Implement vertex texturing.
1654     *
1655     * With 0 vertex samplers available, the linker will reject
1656     * programs that do vertex texturing, but after our visitor has
1657     * run.
1658     */
1659 }
1660
1661 void
1662 vec4_visitor::visit(ir_return *ir)
1663 {
1664    assert(!"not reached");
1665 }
1666
1667 void
1668 vec4_visitor::visit(ir_discard *ir)
1669 {
1670    assert(!"not reached");
1671 }
1672
1673 void
1674 vec4_visitor::visit(ir_if *ir)
1675 {
1676    /* Don't point the annotation at the if statement, because then it plus
1677     * the then and else blocks get printed.
1678     */
1679    this->base_ir = ir->condition;
1680
1681    if (intel->gen == 6) {
1682       emit_if_gen6(ir);
1683    } else {
1684       emit_bool_to_cond_code(ir->condition);
1685       emit(IF(BRW_PREDICATE_NORMAL));
1686    }
1687
1688    visit_instructions(&ir->then_instructions);
1689
1690    if (!ir->else_instructions.is_empty()) {
1691       this->base_ir = ir->condition;
1692       emit(BRW_OPCODE_ELSE);
1693
1694       visit_instructions(&ir->else_instructions);
1695    }
1696
1697    this->base_ir = ir->condition;
1698    emit(BRW_OPCODE_ENDIF);
1699 }
1700
1701 void
1702 vec4_visitor::emit_ndc_computation()
1703 {
1704    /* Get the position */
1705    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1706
1707    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1708    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1709    output_reg[BRW_VERT_RESULT_NDC] = ndc;
1710
1711    current_annotation = "NDC";
1712    dst_reg ndc_w = ndc;
1713    ndc_w.writemask = WRITEMASK_W;
1714    src_reg pos_w = pos;
1715    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1716    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1717
1718    dst_reg ndc_xyz = ndc;
1719    ndc_xyz.writemask = WRITEMASK_XYZ;
1720
1721    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1722 }
1723
1724 void
1725 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1726 {
1727    if (intel->gen < 6 &&
1728        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1729         c->key.nr_userclip || brw->has_negative_rhw_bug)) {
1730       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1731       GLuint i;
1732
1733       emit(MOV(header1, 0u));
1734
1735       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1736          assert(!"finishme: psiz");
1737          src_reg psiz;
1738
1739          header1.writemask = WRITEMASK_W;
1740          emit(MUL(header1, psiz, 1u << 11));
1741          emit(AND(header1, src_reg(header1), 0x7ff << 8));
1742       }
1743
1744       for (i = 0; i < c->key.nr_userclip; i++) {
1745          vec4_instruction *inst;
1746
1747          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
1748                          src_reg(c->userplane[i])));
1749          inst->conditional_mod = BRW_CONDITIONAL_L;
1750
1751          emit(OR(header1, src_reg(header1), 1u << i));
1752          inst->predicate = BRW_PREDICATE_NORMAL;
1753       }
1754
1755       /* i965 clipping workaround:
1756        * 1) Test for -ve rhw
1757        * 2) If set,
1758        *      set ndc = (0,0,0,0)
1759        *      set ucp[6] = 1
1760        *
1761        * Later, clipping will detect ucp[6] and ensure the primitive is
1762        * clipped against all fixed planes.
1763        */
1764       if (brw->has_negative_rhw_bug) {
1765 #if 0
1766          /* FINISHME */
1767          brw_CMP(p,
1768                  vec8(brw_null_reg()),
1769                  BRW_CONDITIONAL_L,
1770                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
1771                  brw_imm_f(0));
1772
1773          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1774          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
1775          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1776 #endif
1777       }
1778
1779       header1.writemask = WRITEMASK_XYZW;
1780       emit(MOV(reg, src_reg(header1)));
1781    } else if (intel->gen < 6) {
1782       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1783    } else {
1784       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1785       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1786          emit(MOV(brw_writemask(reg, WRITEMASK_W),
1787                   src_reg(output_reg[VERT_RESULT_PSIZ])));
1788       }
1789    }
1790 }
1791
1792 void
1793 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
1794 {
1795    if (intel->gen < 6) {
1796       /* Clip distance slots are set aside in gen5, but they are not used.  It
1797        * is not clear whether we actually need to set aside space for them,
1798        * but the performance cost is negligible.
1799        */
1800       return;
1801    }
1802
1803    for (int i = 0; i + offset < c->key.nr_userclip && i < 4; ++i) {
1804       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
1805                src_reg(output_reg[VERT_RESULT_HPOS]),
1806                src_reg(c->userplane[i + offset])));
1807    }
1808 }
1809
1810 void
1811 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
1812 {
1813    struct brw_reg reg = brw_message_reg(mrf);
1814
1815    switch (vert_result) {
1816    case VERT_RESULT_PSIZ:
1817       /* PSIZ is always in slot 0, and is coupled with other flags. */
1818       current_annotation = "indices, point width, clip flags";
1819       emit_psiz_and_flags(reg);
1820       break;
1821    case BRW_VERT_RESULT_NDC:
1822       current_annotation = "NDC";
1823       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
1824       break;
1825    case BRW_VERT_RESULT_HPOS_DUPLICATE:
1826    case VERT_RESULT_HPOS:
1827       current_annotation = "gl_Position";
1828       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
1829       break;
1830    case BRW_VERT_RESULT_CLIP0:
1831       current_annotation = "user clip distances";
1832       emit_clip_distances(reg, 0);
1833       break;
1834    case BRW_VERT_RESULT_CLIP1:
1835       current_annotation = "user clip distances";
1836       emit_clip_distances(reg, 4);
1837       break;
1838    case BRW_VERT_RESULT_PAD:
1839       /* No need to write to this slot */
1840       break;
1841    default: {
1842       assert (vert_result < VERT_RESULT_MAX);
1843       current_annotation = NULL;
1844       /* Copy the register, saturating if necessary */
1845       vec4_instruction *inst = emit(MOV(reg,
1846                                         src_reg(output_reg[vert_result])));
1847       if ((vert_result == VERT_RESULT_COL0 ||
1848            vert_result == VERT_RESULT_COL1 ||
1849            vert_result == VERT_RESULT_BFC0 ||
1850            vert_result == VERT_RESULT_BFC1) &&
1851           c->key.clamp_vertex_color) {
1852          inst->saturate = true;
1853       }
1854    }
1855       break;
1856    }
1857 }
1858
1859 static int
1860 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1861 {
1862    struct intel_context *intel = &brw->intel;
1863
1864    if (intel->gen >= 6) {
1865       /* URB data written (does not include the message header reg) must
1866        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1867        * section 5.4.3.2.2: URB_INTERLEAVED.
1868        *
1869        * URB entries are allocated on a multiple of 1024 bits, so an
1870        * extra 128 bits written here to make the end align to 256 is
1871        * no problem.
1872        */
1873       if ((mlen % 2) != 1)
1874          mlen++;
1875    }
1876
1877    return mlen;
1878 }
1879
1880 /**
1881  * Generates the VUE payload plus the 1 or 2 URB write instructions to
1882  * complete the VS thread.
1883  *
1884  * The VUE layout is documented in Volume 2a.
1885  */
1886 void
1887 vec4_visitor::emit_urb_writes()
1888 {
1889    /* MRF 0 is reserved for the debugger, so start with message header
1890     * in MRF 1.
1891     */
1892    int base_mrf = 1;
1893    int mrf = base_mrf;
1894    int urb_entry_size;
1895    /* In the process of generating our URB write message contents, we
1896     * may need to unspill a register or load from an array.  Those
1897     * reads would use MRFs 14-15.
1898     */
1899    int max_usable_mrf = 13;
1900
1901    /* FINISHME: edgeflag */
1902
1903    brw_compute_vue_map(&c->vue_map, intel, c->key.nr_userclip,
1904                        c->key.two_side_color, c->prog_data.outputs_written);
1905
1906    /* First mrf is the g0-based message header containing URB handles and such,
1907     * which is implied in VS_OPCODE_URB_WRITE.
1908     */
1909    mrf++;
1910
1911    if (intel->gen < 6) {
1912       emit_ndc_computation();
1913    }
1914
1915    /* Set up the VUE data for the first URB write */
1916    int slot;
1917    for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
1918       emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
1919
1920       /* If this was MRF 15, we can't fit anything more into this URB
1921        * WRITE.  Note that base_mrf of 1 means that MRF 15 is an
1922        * even-numbered amount of URB write data, which will meet
1923        * gen6's requirements for length alignment.
1924        */
1925       if (mrf > max_usable_mrf) {
1926          slot++;
1927          break;
1928       }
1929    }
1930
1931    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1932    inst->base_mrf = base_mrf;
1933    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1934    inst->eot = (slot >= c->vue_map.num_slots);
1935
1936    urb_entry_size = mrf - base_mrf;
1937
1938    /* Optional second URB write */
1939    if (!inst->eot) {
1940       mrf = base_mrf + 1;
1941
1942       for (; slot < c->vue_map.num_slots; ++slot) {
1943          assert(mrf < max_usable_mrf);
1944
1945          emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
1946       }
1947
1948       inst = emit(VS_OPCODE_URB_WRITE);
1949       inst->base_mrf = base_mrf;
1950       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1951       inst->eot = true;
1952       /* URB destination offset.  In the previous write, we got MRFs
1953        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
1954        * URB row increments, and each of our MRFs is half of one of
1955        * those, since we're doing interleaved writes.
1956        */
1957       inst->offset = (max_usable_mrf - base_mrf) / 2;
1958
1959       urb_entry_size += mrf - base_mrf;
1960    }
1961
1962    if (intel->gen == 6)
1963       c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
1964    else
1965       c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
1966 }
1967
1968 src_reg
1969 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
1970                                  src_reg *reladdr, int reg_offset)
1971 {
1972    /* Because we store the values to scratch interleaved like our
1973     * vertex data, we need to scale the vec4 index by 2.
1974     */
1975    int message_header_scale = 2;
1976
1977    /* Pre-gen6, the message header uses byte offsets instead of vec4
1978     * (16-byte) offset units.
1979     */
1980    if (intel->gen < 6)
1981       message_header_scale *= 16;
1982
1983    if (reladdr) {
1984       src_reg index = src_reg(this, glsl_type::int_type);
1985
1986       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
1987       emit_before(inst, MUL(dst_reg(index),
1988                             index, src_reg(message_header_scale)));
1989
1990       return index;
1991    } else {
1992       return src_reg(reg_offset * message_header_scale);
1993    }
1994 }
1995
1996 src_reg
1997 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
1998                                        src_reg *reladdr, int reg_offset)
1999 {
2000    if (reladdr) {
2001       src_reg index = src_reg(this, glsl_type::int_type);
2002
2003       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2004
2005       /* Pre-gen6, the message header uses byte offsets instead of vec4
2006        * (16-byte) offset units.
2007        */
2008       if (intel->gen < 6) {
2009          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2010       }
2011
2012       return index;
2013    } else {
2014       int message_header_scale = intel->gen < 6 ? 16 : 1;
2015       return src_reg(reg_offset * message_header_scale);
2016    }
2017 }
2018
2019 /**
2020  * Emits an instruction before @inst to load the value named by @orig_src
2021  * from scratch space at @base_offset to @temp.
2022  */
2023 void
2024 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2025                                 dst_reg temp, src_reg orig_src,
2026                                 int base_offset)
2027 {
2028    int reg_offset = base_offset + orig_src.reg_offset;
2029    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2030
2031    emit_before(inst, SCRATCH_READ(temp, index));
2032 }
2033
2034 /**
2035  * Emits an instruction after @inst to store the value to be written
2036  * to @orig_dst to scratch space at @base_offset, from @temp.
2037  */
2038 void
2039 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2040                                  src_reg temp, dst_reg orig_dst,
2041                                  int base_offset)
2042 {
2043    int reg_offset = base_offset + orig_dst.reg_offset;
2044    src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2045
2046    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2047                                        orig_dst.writemask));
2048    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2049    write->predicate = inst->predicate;
2050    write->ir = inst->ir;
2051    write->annotation = inst->annotation;
2052    inst->insert_after(write);
2053 }
2054
2055 /**
2056  * We can't generally support array access in GRF space, because a
2057  * single instruction's destination can only span 2 contiguous
2058  * registers.  So, we send all GRF arrays that get variable index
2059  * access to scratch space.
2060  */
2061 void
2062 vec4_visitor::move_grf_array_access_to_scratch()
2063 {
2064    int scratch_loc[this->virtual_grf_count];
2065
2066    for (int i = 0; i < this->virtual_grf_count; i++) {
2067       scratch_loc[i] = -1;
2068    }
2069
2070    /* First, calculate the set of virtual GRFs that need to be punted
2071     * to scratch due to having any array access on them, and where in
2072     * scratch.
2073     */
2074    foreach_list(node, &this->instructions) {
2075       vec4_instruction *inst = (vec4_instruction *)node;
2076
2077       if (inst->dst.file == GRF && inst->dst.reladdr &&
2078           scratch_loc[inst->dst.reg] == -1) {
2079          scratch_loc[inst->dst.reg] = c->last_scratch;
2080          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2081       }
2082
2083       for (int i = 0 ; i < 3; i++) {
2084          src_reg *src = &inst->src[i];
2085
2086          if (src->file == GRF && src->reladdr &&
2087              scratch_loc[src->reg] == -1) {
2088             scratch_loc[src->reg] = c->last_scratch;
2089             c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2090          }
2091       }
2092    }
2093
2094    /* Now, for anything that will be accessed through scratch, rewrite
2095     * it to load/store.  Note that this is a _safe list walk, because
2096     * we may generate a new scratch_write instruction after the one
2097     * we're processing.
2098     */
2099    foreach_list_safe(node, &this->instructions) {
2100       vec4_instruction *inst = (vec4_instruction *)node;
2101
2102       /* Set up the annotation tracking for new generated instructions. */
2103       base_ir = inst->ir;
2104       current_annotation = inst->annotation;
2105
2106       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2107          src_reg temp = src_reg(this, glsl_type::vec4_type);
2108
2109          emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2110
2111          inst->dst.file = temp.file;
2112          inst->dst.reg = temp.reg;
2113          inst->dst.reg_offset = temp.reg_offset;
2114          inst->dst.reladdr = NULL;
2115       }
2116
2117       for (int i = 0 ; i < 3; i++) {
2118          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2119             continue;
2120
2121          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2122
2123          emit_scratch_read(inst, temp, inst->src[i],
2124                            scratch_loc[inst->src[i].reg]);
2125
2126          inst->src[i].file = temp.file;
2127          inst->src[i].reg = temp.reg;
2128          inst->src[i].reg_offset = temp.reg_offset;
2129          inst->src[i].reladdr = NULL;
2130       }
2131    }
2132 }
2133
2134 /**
2135  * Emits an instruction before @inst to load the value named by @orig_src
2136  * from the pull constant buffer (surface) at @base_offset to @temp.
2137  */
2138 void
2139 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2140                                       dst_reg temp, src_reg orig_src,
2141                                       int base_offset)
2142 {
2143    int reg_offset = base_offset + orig_src.reg_offset;
2144    src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2145    vec4_instruction *load;
2146
2147    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2148                                         temp, index);
2149    load->base_mrf = 14;
2150    load->mlen = 1;
2151    emit_before(inst, load);
2152 }
2153
2154 /**
2155  * Implements array access of uniforms by inserting a
2156  * PULL_CONSTANT_LOAD instruction.
2157  *
2158  * Unlike temporary GRF array access (where we don't support it due to
2159  * the difficulty of doing relative addressing on instruction
2160  * destinations), we could potentially do array access of uniforms
2161  * that were loaded in GRF space as push constants.  In real-world
2162  * usage we've seen, though, the arrays being used are always larger
2163  * than we could load as push constants, so just always move all
2164  * uniform array access out to a pull constant buffer.
2165  */
2166 void
2167 vec4_visitor::move_uniform_array_access_to_pull_constants()
2168 {
2169    int pull_constant_loc[this->uniforms];
2170
2171    for (int i = 0; i < this->uniforms; i++) {
2172       pull_constant_loc[i] = -1;
2173    }
2174
2175    /* Walk through and find array access of uniforms.  Put a copy of that
2176     * uniform in the pull constant buffer.
2177     *
2178     * Note that we don't move constant-indexed accesses to arrays.  No
2179     * testing has been done of the performance impact of this choice.
2180     */
2181    foreach_list_safe(node, &this->instructions) {
2182       vec4_instruction *inst = (vec4_instruction *)node;
2183
2184       for (int i = 0 ; i < 3; i++) {
2185          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2186             continue;
2187
2188          int uniform = inst->src[i].reg;
2189
2190          /* If this array isn't already present in the pull constant buffer,
2191           * add it.
2192           */
2193          if (pull_constant_loc[uniform] == -1) {
2194             const float **values = &prog_data->param[uniform * 4];
2195
2196             pull_constant_loc[uniform] = prog_data->nr_pull_params;
2197
2198             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2199                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2200             }
2201          }
2202
2203          /* Set up the annotation tracking for new generated instructions. */
2204          base_ir = inst->ir;
2205          current_annotation = inst->annotation;
2206
2207          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2208
2209          emit_pull_constant_load(inst, temp, inst->src[i],
2210                                  pull_constant_loc[uniform]);
2211
2212          inst->src[i].file = temp.file;
2213          inst->src[i].reg = temp.reg;
2214          inst->src[i].reg_offset = temp.reg_offset;
2215          inst->src[i].reladdr = NULL;
2216       }
2217    }
2218
2219    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2220     * no need to track them as larger-than-vec4 objects.  This will be
2221     * relied on in cutting out unused uniform vectors from push
2222     * constants.
2223     */
2224    split_uniform_registers();
2225 }
2226
2227 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2228                            struct gl_shader_program *prog,
2229                            struct brw_shader *shader)
2230 {
2231    this->c = c;
2232    this->p = &c->func;
2233    this->brw = p->brw;
2234    this->intel = &brw->intel;
2235    this->ctx = &intel->ctx;
2236    this->prog = prog;
2237    this->shader = shader;
2238
2239    this->mem_ctx = ralloc_context(NULL);
2240    this->failed = false;
2241
2242    this->base_ir = NULL;
2243    this->current_annotation = NULL;
2244
2245    this->c = c;
2246    this->vp = prog->VertexProgram;
2247    this->prog_data = &c->prog_data;
2248
2249    this->variable_ht = hash_table_ctor(0,
2250                                        hash_table_pointer_hash,
2251                                        hash_table_pointer_compare);
2252
2253    this->virtual_grf_def = NULL;
2254    this->virtual_grf_use = NULL;
2255    this->virtual_grf_sizes = NULL;
2256    this->virtual_grf_count = 0;
2257    this->virtual_grf_array_size = 0;
2258    this->live_intervals_valid = false;
2259
2260    this->uniforms = 0;
2261
2262    this->variable_ht = hash_table_ctor(0,
2263                                        hash_table_pointer_hash,
2264                                        hash_table_pointer_compare);
2265 }
2266
2267 vec4_visitor::~vec4_visitor()
2268 {
2269    ralloc_free(this->mem_ctx);
2270    hash_table_dtor(this->variable_ht);
2271 }
2272
2273
2274 void
2275 vec4_visitor::fail(const char *format, ...)
2276 {
2277    va_list va;
2278    char *msg;
2279
2280    if (failed)
2281       return;
2282
2283    failed = true;
2284
2285    va_start(va, format);
2286    msg = ralloc_vasprintf(mem_ctx, format, va);
2287    va_end(va);
2288    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2289
2290    this->fail_msg = msg;
2291
2292    if (INTEL_DEBUG & DEBUG_VS) {
2293       fprintf(stderr, "%s",  msg);
2294    }
2295 }
2296
2297 } /* namespace brw */