src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 #include "program/sampler.h"
  29 }
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(vec4_visitor *v,
  34                                    enum opcode opcode, dst_reg dst,
  35                                    src_reg src0, src_reg src1, src_reg src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->ir = v->base_ir;
  43    this->annotation = v->current_annotation;
  44 }
  45
  46 vec4_instruction *
  47 vec4_visitor::emit(vec4_instruction *inst)
  48 {
  49    this->instructions.push_tail(inst);
  50
  51    return inst;
  52 }
  53
  54 vec4_instruction *
  55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  56 {
  57    new_inst->ir = inst->ir;
  58    new_inst->annotation = inst->annotation;
  59
  60    inst->insert_before(new_inst);
  61
  62    return inst;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  67                    src_reg src0, src_reg src1, src_reg src2)
  68 {
  69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  70                                              src0, src1, src2));
  71 }
  72
  73
  74 vec4_instruction *
  75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  76 {
  77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  78 }
  79
  80 vec4_instruction *
  81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  90 }
  91
  92 #define ALU1(op)                                                        \
  93    vec4_instruction *                                                   \
  94    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  95    {                                                                    \
  96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  97                                            src0);                       \
  98    }
  99
 100 #define ALU2(op)                                                        \
 101    vec4_instruction *                                                   \
 102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 103    {                                                                    \
 104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 105                                            src0, src1);                 \
 106    }
 107
 108 ALU1(NOT)
 109 ALU1(MOV)
 110 ALU1(FRC)
 111 ALU1(RNDD)
 112 ALU1(RNDE)
 113 ALU1(RNDZ)
 114 ALU2(ADD)
 115 ALU2(MUL)
 116 ALU2(MACH)
 117 ALU2(AND)
 118 ALU2(OR)
 119 ALU2(XOR)
 120 ALU2(DP3)
 121 ALU2(DP4)
 122
 123 /** Gen4 predicated IF. */
 124 vec4_instruction *
 125 vec4_visitor::IF(uint32_t predicate)
 126 {
 127    vec4_instruction *inst;
 128
 129    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 130    inst->predicate = predicate;
 131
 132    return inst;
 133 }
 134
 135 /** Gen6+ IF with embedded comparison. */
 136 vec4_instruction *
 137 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 138 {
 139    assert(intel->gen >= 6);
 140
 141    vec4_instruction *inst;
 142
 143    resolve_ud_negate(&src0);
 144    resolve_ud_negate(&src1);
 145
 146    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 147                                         src0, src1);
 148    inst->conditional_mod = condition;
 149
 150    return inst;
 151 }
 152
 153 /**
 154  * CMP: Sets the low bit of the destination channels with the result
 155  * of the comparison, while the upper bits are undefined, and updates
 156  * the flag register with the packed 16 bits of the result.
 157  */
 158 vec4_instruction *
 159 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 160 {
 161    vec4_instruction *inst;
 162
 163    /* original gen4 does type conversion to the destination type
 164     * before before comparison, producing garbage results for floating
 165     * point comparisons.
 166     */
 167    if (intel->gen == 4) {
 168       dst.type = src0.type;
 169       if (dst.file == HW_REG)
 170          dst.fixed_hw_reg.type = dst.type;
 171    }
 172
 173    resolve_ud_negate(&src0);
 174    resolve_ud_negate(&src1);
 175
 176    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 177    inst->conditional_mod = condition;
 178
 179    return inst;
 180 }
 181
 182 vec4_instruction *
 183 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 184 {
 185    vec4_instruction *inst;
 186
 187    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 188                                         dst, index);
 189    inst->base_mrf = 14;
 190    inst->mlen = 1;
 191
 192    return inst;
 193 }
 194
 195 vec4_instruction *
 196 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 197 {
 198    vec4_instruction *inst;
 199
 200    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 201                                         dst, src, index);
 202    inst->base_mrf = 13;
 203    inst->mlen = 2;
 204
 205    return inst;
 206 }
 207
 208 void
 209 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 210 {
 211    static enum opcode dot_opcodes[] = {
 212       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 213    };
 214
 215    emit(dot_opcodes[elements - 2], dst, src0, src1);
 216 }
 217
 218 void
 219 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 220 {
 221    /* The gen6 math instruction ignores the source modifiers --
 222     * swizzle, abs, negate, and at least some parts of the register
 223     * region description.
 224     *
 225     * While it would seem that this MOV could be avoided at this point
 226     * in the case that the swizzle is matched up with the destination
 227     * writemask, note that uniform packing and register allocation
 228     * could rearrange our swizzle, so let's leave this matter up to
 229     * copy propagation later.
 230     */
 231    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 232    emit(MOV(dst_reg(temp_src), src));
 233
 234    if (dst.writemask != WRITEMASK_XYZW) {
 235       /* The gen6 math instruction must be align1, so we can't do
 236        * writemasks.
 237        */
 238       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 239
 240       emit(opcode, temp_dst, temp_src);
 241
 242       emit(MOV(dst, src_reg(temp_dst)));
 243    } else {
 244       emit(opcode, dst, temp_src);
 245    }
 246 }
 247
 248 void
 249 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 250 {
 251    vec4_instruction *inst = emit(opcode, dst, src);
 252    inst->base_mrf = 1;
 253    inst->mlen = 1;
 254 }
 255
 256 void
 257 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 258 {
 259    switch (opcode) {
 260    case SHADER_OPCODE_RCP:
 261    case SHADER_OPCODE_RSQ:
 262    case SHADER_OPCODE_SQRT:
 263    case SHADER_OPCODE_EXP2:
 264    case SHADER_OPCODE_LOG2:
 265    case SHADER_OPCODE_SIN:
 266    case SHADER_OPCODE_COS:
 267       break;
 268    default:
 269       assert(!"not reached: bad math opcode");
 270       return;
 271    }
 272
 273    if (intel->gen >= 7) {
 274       emit(opcode, dst, src);
 275    } else if (intel->gen == 6) {
 276       return emit_math1_gen6(opcode, dst, src);
 277    } else {
 278       return emit_math1_gen4(opcode, dst, src);
 279    }
 280 }
 281
 282 void
 283 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 284                               dst_reg dst, src_reg src0, src_reg src1)
 285 {
 286    src_reg expanded;
 287
 288    /* The gen6 math instruction ignores the source modifiers --
 289     * swizzle, abs, negate, and at least some parts of the register
 290     * region description.  Move the sources to temporaries to make it
 291     * generally work.
 292     */
 293
 294    expanded = src_reg(this, glsl_type::vec4_type);
 295    expanded.type = src0.type;
 296    emit(MOV(dst_reg(expanded), src0));
 297    src0 = expanded;
 298
 299    expanded = src_reg(this, glsl_type::vec4_type);
 300    expanded.type = src1.type;
 301    emit(MOV(dst_reg(expanded), src1));
 302    src1 = expanded;
 303
 304    if (dst.writemask != WRITEMASK_XYZW) {
 305       /* The gen6 math instruction must be align1, so we can't do
 306        * writemasks.
 307        */
 308       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 309       temp_dst.type = dst.type;
 310
 311       emit(opcode, temp_dst, src0, src1);
 312
 313       emit(MOV(dst, src_reg(temp_dst)));
 314    } else {
 315       emit(opcode, dst, src0, src1);
 316    }
 317 }
 318
 319 void
 320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 321                               dst_reg dst, src_reg src0, src_reg src1)
 322 {
 323    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 324    inst->base_mrf = 1;
 325    inst->mlen = 2;
 326 }
 327
 328 void
 329 vec4_visitor::emit_math(enum opcode opcode,
 330                         dst_reg dst, src_reg src0, src_reg src1)
 331 {
 332    switch (opcode) {
 333    case SHADER_OPCODE_POW:
 334    case SHADER_OPCODE_INT_QUOTIENT:
 335    case SHADER_OPCODE_INT_REMAINDER:
 336       break;
 337    default:
 338       assert(!"not reached: unsupported binary math opcode");
 339       return;
 340    }
 341
 342    if (intel->gen >= 7) {
 343       emit(opcode, dst, src0, src1);
 344    } else if (intel->gen == 6) {
 345       return emit_math2_gen6(opcode, dst, src0, src1);
 346    } else {
 347       return emit_math2_gen4(opcode, dst, src0, src1);
 348    }
 349 }
 350
 351 void
 352 vec4_visitor::visit_instructions(const exec_list *list)
 353 {
 354    foreach_list(node, list) {
 355       ir_instruction *ir = (ir_instruction *)node;
 356
 357       base_ir = ir;
 358       ir->accept(this);
 359    }
 360 }
 361
 362
 363 static int
 364 type_size(const struct glsl_type *type)
 365 {
 366    unsigned int i;
 367    int size;
 368
 369    switch (type->base_type) {
 370    case GLSL_TYPE_UINT:
 371    case GLSL_TYPE_INT:
 372    case GLSL_TYPE_FLOAT:
 373    case GLSL_TYPE_BOOL:
 374       if (type->is_matrix()) {
 375          return type->matrix_columns;
 376       } else {
 377          /* Regardless of size of vector, it gets a vec4. This is bad
 378           * packing for things like floats, but otherwise arrays become a
 379           * mess.  Hopefully a later pass over the code can pack scalars
 380           * down if appropriate.
 381           */
 382          return 1;
 383       }
 384    case GLSL_TYPE_ARRAY:
 385       assert(type->length > 0);
 386       return type_size(type->fields.array) * type->length;
 387    case GLSL_TYPE_STRUCT:
 388       size = 0;
 389       for (i = 0; i < type->length; i++) {
 390          size += type_size(type->fields.structure[i].type);
 391       }
 392       return size;
 393    case GLSL_TYPE_SAMPLER:
 394       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 395        * at link time.
 396        */
 397       return 1;
 398    default:
 399       assert(0);
 400       return 0;
 401    }
 402 }
 403
 404 int
 405 vec4_visitor::virtual_grf_alloc(int size)
 406 {
 407    if (virtual_grf_array_size <= virtual_grf_count) {
 408       if (virtual_grf_array_size == 0)
 409          virtual_grf_array_size = 16;
 410       else
 411          virtual_grf_array_size *= 2;
 412       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 413                                    virtual_grf_array_size);
 414       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 415                                      virtual_grf_array_size);
 416    }
 417    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 418    virtual_grf_reg_count += size;
 419    virtual_grf_sizes[virtual_grf_count] = size;
 420    return virtual_grf_count++;
 421 }
 422
 423 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 424 {
 425    init();
 426
 427    this->file = GRF;
 428    this->reg = v->virtual_grf_alloc(type_size(type));
 429
 430    if (type->is_array() || type->is_record()) {
 431       this->swizzle = BRW_SWIZZLE_NOOP;
 432    } else {
 433       this->swizzle = swizzle_for_size(type->vector_elements);
 434    }
 435
 436    this->type = brw_type_for_base_type(type);
 437 }
 438
 439 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 440 {
 441    init();
 442
 443    this->file = GRF;
 444    this->reg = v->virtual_grf_alloc(type_size(type));
 445
 446    if (type->is_array() || type->is_record()) {
 447       this->writemask = WRITEMASK_XYZW;
 448    } else {
 449       this->writemask = (1 << type->vector_elements) - 1;
 450    }
 451
 452    this->type = brw_type_for_base_type(type);
 453 }
 454
 455 /* Our support for uniforms is piggy-backed on the struct
 456  * gl_fragment_program, because that's where the values actually
 457  * get stored, rather than in some global gl_shader_program uniform
 458  * store.
 459  */
 460 int
 461 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 462 {
 463    unsigned int offset = 0;
 464    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 465
 466    if (type->is_matrix()) {
 467       const glsl_type *column = type->column_type();
 468
 469       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 470          offset += setup_uniform_values(loc + offset, column);
 471       }
 472
 473       return offset;
 474    }
 475
 476    switch (type->base_type) {
 477    case GLSL_TYPE_FLOAT:
 478    case GLSL_TYPE_UINT:
 479    case GLSL_TYPE_INT:
 480    case GLSL_TYPE_BOOL:
 481       for (unsigned int i = 0; i < type->vector_elements; i++) {
 482          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 483       }
 484
 485       /* Set up pad elements to get things aligned to a vec4 boundary. */
 486       for (unsigned int i = type->vector_elements; i < 4; i++) {
 487          static float zero = 0;
 488
 489          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 490       }
 491
 492       /* Track the size of this uniform vector, for future packing of
 493        * uniforms.
 494        */
 495       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 496       this->uniforms++;
 497
 498       return 1;
 499
 500    case GLSL_TYPE_STRUCT:
 501       for (unsigned int i = 0; i < type->length; i++) {
 502          offset += setup_uniform_values(loc + offset,
 503                                         type->fields.structure[i].type);
 504       }
 505       return offset;
 506
 507    case GLSL_TYPE_ARRAY:
 508       for (unsigned int i = 0; i < type->length; i++) {
 509          offset += setup_uniform_values(loc + offset, type->fields.array);
 510       }
 511       return offset;
 512
 513    case GLSL_TYPE_SAMPLER:
 514       /* The sampler takes up a slot, but we don't use any values from it. */
 515       return 1;
 516
 517    default:
 518       assert(!"not reached");
 519       return 0;
 520    }
 521 }
 522
 523 void
 524 vec4_visitor::setup_uniform_clipplane_values()
 525 {
 526    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 527
 528    /* Pre-Gen6, we compact clip planes.  For example, if the user
 529     * enables just clip planes 0, 1, and 3, we will enable clip planes
 530     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 531     * plane 2.  This simplifies the implementation of the Gen6 clip
 532     * thread.
 533     *
 534     * In Gen6 and later, we don't compact clip planes, because this
 535     * simplifies the implementation of gl_ClipDistance.
 536     */
 537    int compacted_clipplane_index = 0;
 538    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 539       if (intel->gen < 6 &&
 540           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
 541          continue;
 542       }
 543       this->uniform_vector_size[this->uniforms] = 4;
 544       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 545       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 546       for (int j = 0; j < 4; ++j) {
 547          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 548       }
 549       ++compacted_clipplane_index;
 550       ++this->uniforms;
 551    }
 552 }
 553
 554 /* Our support for builtin uniforms is even scarier than non-builtin.
 555  * It sits on top of the PROG_STATE_VAR parameters that are
 556  * automatically updated from GL context state.
 557  */
 558 void
 559 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 560 {
 561    const ir_state_slot *const slots = ir->state_slots;
 562    assert(ir->state_slots != NULL);
 563
 564    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 565       /* This state reference has already been setup by ir_to_mesa,
 566        * but we'll get the same index back here.  We can reference
 567        * ParameterValues directly, since unlike brw_fs.cpp, we never
 568        * add new state references during compile.
 569        */
 570       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 571                                             (gl_state_index *)slots[i].tokens);
 572       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 573
 574       this->uniform_vector_size[this->uniforms] = 0;
 575       /* Add each of the unique swizzled channels of the element.
 576        * This will end up matching the size of the glsl_type of this field.
 577        */
 578       int last_swiz = -1;
 579       for (unsigned int j = 0; j < 4; j++) {
 580          int swiz = GET_SWZ(slots[i].swizzle, j);
 581          last_swiz = swiz;
 582
 583          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 584          if (swiz <= last_swiz)
 585             this->uniform_vector_size[this->uniforms]++;
 586       }
 587       this->uniforms++;
 588    }
 589 }
 590
 591 dst_reg *
 592 vec4_visitor::variable_storage(ir_variable *var)
 593 {
 594    return (dst_reg *)hash_table_find(this->variable_ht, var);
 595 }
 596
 597 void
 598 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 599 {
 600    ir_expression *expr = ir->as_expression();
 601
 602    *predicate = BRW_PREDICATE_NORMAL;
 603
 604    if (expr) {
 605       src_reg op[2];
 606       vec4_instruction *inst;
 607
 608       assert(expr->get_num_operands() <= 2);
 609       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 610          expr->operands[i]->accept(this);
 611          op[i] = this->result;
 612
 613          resolve_ud_negate(&op[i]);
 614       }
 615
 616       switch (expr->operation) {
 617       case ir_unop_logic_not:
 618          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 619          inst->conditional_mod = BRW_CONDITIONAL_Z;
 620          break;
 621
 622       case ir_binop_logic_xor:
 623          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 624          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 625          break;
 626
 627       case ir_binop_logic_or:
 628          inst = emit(OR(dst_null_d(), op[0], op[1]));
 629          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 630          break;
 631
 632       case ir_binop_logic_and:
 633          inst = emit(AND(dst_null_d(), op[0], op[1]));
 634          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 635          break;
 636
 637       case ir_unop_f2b:
 638          if (intel->gen >= 6) {
 639             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 640          } else {
 641             inst = emit(MOV(dst_null_f(), op[0]));
 642             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 643          }
 644          break;
 645
 646       case ir_unop_i2b:
 647          if (intel->gen >= 6) {
 648             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 649          } else {
 650             inst = emit(MOV(dst_null_d(), op[0]));
 651             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 652          }
 653          break;
 654
 655       case ir_binop_all_equal:
 656          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 657          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 658          break;
 659
 660       case ir_binop_any_nequal:
 661          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 662          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 663          break;
 664
 665       case ir_unop_any:
 666          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 667          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 668          break;
 669
 670       case ir_binop_greater:
 671       case ir_binop_gequal:
 672       case ir_binop_less:
 673       case ir_binop_lequal:
 674       case ir_binop_equal:
 675       case ir_binop_nequal:
 676          emit(CMP(dst_null_d(), op[0], op[1],
 677                   brw_conditional_for_comparison(expr->operation)));
 678          break;
 679
 680       default:
 681          assert(!"not reached");
 682          break;
 683       }
 684       return;
 685    }
 686
 687    ir->accept(this);
 688
 689    resolve_ud_negate(&this->result);
 690
 691    if (intel->gen >= 6) {
 692       vec4_instruction *inst = emit(AND(dst_null_d(),
 693                                         this->result, src_reg(1)));
 694       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 695    } else {
 696       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 697       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 698    }
 699 }
 700
 701 /**
 702  * Emit a gen6 IF statement with the comparison folded into the IF
 703  * instruction.
 704  */
 705 void
 706 vec4_visitor::emit_if_gen6(ir_if *ir)
 707 {
 708    ir_expression *expr = ir->condition->as_expression();
 709
 710    if (expr) {
 711       src_reg op[2];
 712       dst_reg temp;
 713
 714       assert(expr->get_num_operands() <= 2);
 715       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 716          expr->operands[i]->accept(this);
 717          op[i] = this->result;
 718       }
 719
 720       switch (expr->operation) {
 721       case ir_unop_logic_not:
 722          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 723          return;
 724
 725       case ir_binop_logic_xor:
 726          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 727          return;
 728
 729       case ir_binop_logic_or:
 730          temp = dst_reg(this, glsl_type::bool_type);
 731          emit(OR(temp, op[0], op[1]));
 732          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 733          return;
 734
 735       case ir_binop_logic_and:
 736          temp = dst_reg(this, glsl_type::bool_type);
 737          emit(AND(temp, op[0], op[1]));
 738          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 739          return;
 740
 741       case ir_unop_f2b:
 742          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 743          return;
 744
 745       case ir_unop_i2b:
 746          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 747          return;
 748
 749       case ir_binop_greater:
 750       case ir_binop_gequal:
 751       case ir_binop_less:
 752       case ir_binop_lequal:
 753       case ir_binop_equal:
 754       case ir_binop_nequal:
 755          emit(IF(op[0], op[1],
 756                  brw_conditional_for_comparison(expr->operation)));
 757          return;
 758
 759       case ir_binop_all_equal:
 760          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 761          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 762          return;
 763
 764       case ir_binop_any_nequal:
 765          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 766          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 767          return;
 768
 769       case ir_unop_any:
 770          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 771          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 772          return;
 773
 774       default:
 775          assert(!"not reached");
 776          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 777          return;
 778       }
 779       return;
 780    }
 781
 782    ir->condition->accept(this);
 783
 784    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 785 }
 786
 787 void
 788 vec4_visitor::visit(ir_variable *ir)
 789 {
 790    dst_reg *reg = NULL;
 791
 792    if (variable_storage(ir))
 793       return;
 794
 795    switch (ir->mode) {
 796    case ir_var_in:
 797       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 798
 799       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 800        * come in as floating point conversions of the integer values.
 801        */
 802       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 803          if (!c->key.gl_fixed_input_size[i])
 804             continue;
 805
 806          dst_reg dst = *reg;
 807          dst.type = brw_type_for_base_type(ir->type);
 808          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 809          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 810       }
 811       break;
 812
 813    case ir_var_out:
 814       reg = new(mem_ctx) dst_reg(this, ir->type);
 815
 816       for (int i = 0; i < type_size(ir->type); i++) {
 817          output_reg[ir->location + i] = *reg;
 818          output_reg[ir->location + i].reg_offset = i;
 819          output_reg[ir->location + i].type =
 820             brw_type_for_base_type(ir->type->get_scalar_type());
 821          output_reg_annotation[ir->location + i] = ir->name;
 822       }
 823       break;
 824
 825    case ir_var_auto:
 826    case ir_var_temporary:
 827       reg = new(mem_ctx) dst_reg(this, ir->type);
 828       break;
 829
 830    case ir_var_uniform:
 831       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 832
 833       /* Thanks to the lower_ubo_reference pass, we will see only
 834        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 835        * variables, so no need for them to be in variable_ht.
 836        */
 837       if (ir->uniform_block != -1)
 838          return;
 839
 840       /* Track how big the whole uniform variable is, in case we need to put a
 841        * copy of its data into pull constants for array access.
 842        */
 843       this->uniform_size[this->uniforms] = type_size(ir->type);
 844
 845       if (!strncmp(ir->name, "gl_", 3)) {
 846          setup_builtin_uniform_values(ir);
 847       } else {
 848          setup_uniform_values(ir->location, ir->type);
 849       }
 850       break;
 851
 852    case ir_var_system_value:
 853       /* VertexID is stored by the VF as the last vertex element, but
 854        * we don't represent it with a flag in inputs_read, so we call
 855        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 856        */
 857       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 858       prog_data->uses_vertexid = true;
 859
 860       switch (ir->location) {
 861       case SYSTEM_VALUE_VERTEX_ID:
 862          reg->writemask = WRITEMASK_X;
 863          break;
 864       case SYSTEM_VALUE_INSTANCE_ID:
 865          reg->writemask = WRITEMASK_Y;
 866          break;
 867       default:
 868          assert(!"not reached");
 869          break;
 870       }
 871       break;
 872
 873    default:
 874       assert(!"not reached");
 875    }
 876
 877    reg->type = brw_type_for_base_type(ir->type);
 878    hash_table_insert(this->variable_ht, reg, ir);
 879 }
 880
 881 void
 882 vec4_visitor::visit(ir_loop *ir)
 883 {
 884    dst_reg counter;
 885
 886    /* We don't want debugging output to print the whole body of the
 887     * loop as the annotation.
 888     */
 889    this->base_ir = NULL;
 890
 891    if (ir->counter != NULL) {
 892       this->base_ir = ir->counter;
 893       ir->counter->accept(this);
 894       counter = *(variable_storage(ir->counter));
 895
 896       if (ir->from != NULL) {
 897          this->base_ir = ir->from;
 898          ir->from->accept(this);
 899
 900          emit(MOV(counter, this->result));
 901       }
 902    }
 903
 904    emit(BRW_OPCODE_DO);
 905
 906    if (ir->to) {
 907       this->base_ir = ir->to;
 908       ir->to->accept(this);
 909
 910       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 911                brw_conditional_for_comparison(ir->cmp)));
 912
 913       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 914       inst->predicate = BRW_PREDICATE_NORMAL;
 915    }
 916
 917    visit_instructions(&ir->body_instructions);
 918
 919
 920    if (ir->increment) {
 921       this->base_ir = ir->increment;
 922       ir->increment->accept(this);
 923       emit(ADD(counter, src_reg(counter), this->result));
 924    }
 925
 926    emit(BRW_OPCODE_WHILE);
 927 }
 928
 929 void
 930 vec4_visitor::visit(ir_loop_jump *ir)
 931 {
 932    switch (ir->mode) {
 933    case ir_loop_jump::jump_break:
 934       emit(BRW_OPCODE_BREAK);
 935       break;
 936    case ir_loop_jump::jump_continue:
 937       emit(BRW_OPCODE_CONTINUE);
 938       break;
 939    }
 940 }
 941
 942
 943 void
 944 vec4_visitor::visit(ir_function_signature *ir)
 945 {
 946    assert(0);
 947    (void)ir;
 948 }
 949
 950 void
 951 vec4_visitor::visit(ir_function *ir)
 952 {
 953    /* Ignore function bodies other than main() -- we shouldn't see calls to
 954     * them since they should all be inlined.
 955     */
 956    if (strcmp(ir->name, "main") == 0) {
 957       const ir_function_signature *sig;
 958       exec_list empty;
 959
 960       sig = ir->matching_signature(&empty);
 961
 962       assert(sig);
 963
 964       visit_instructions(&sig->body);
 965    }
 966 }
 967
 968 bool
 969 vec4_visitor::try_emit_sat(ir_expression *ir)
 970 {
 971    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 972    if (!sat_src)
 973       return false;
 974
 975    sat_src->accept(this);
 976    src_reg src = this->result;
 977
 978    this->result = src_reg(this, ir->type);
 979    vec4_instruction *inst;
 980    inst = emit(MOV(dst_reg(this->result), src));
 981    inst->saturate = true;
 982
 983    return true;
 984 }
 985
 986 void
 987 vec4_visitor::emit_bool_comparison(unsigned int op,
 988                                  dst_reg dst, src_reg src0, src_reg src1)
 989 {
 990    /* original gen4 does destination conversion before comparison. */
 991    if (intel->gen < 5)
 992       dst.type = src0.type;
 993
 994    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 995
 996    dst.type = BRW_REGISTER_TYPE_D;
 997    emit(AND(dst, src_reg(dst), src_reg(0x1)));
 998 }
 999
1000 void
1001 vec4_visitor::visit(ir_expression *ir)
1002 {
1003    unsigned int operand;
1004    src_reg op[Elements(ir->operands)];
1005    src_reg result_src;
1006    dst_reg result_dst;
1007    vec4_instruction *inst;
1008
1009    if (try_emit_sat(ir))
1010       return;
1011
1012    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1013       this->result.file = BAD_FILE;
1014       ir->operands[operand]->accept(this);
1015       if (this->result.file == BAD_FILE) {
1016          printf("Failed to get tree for expression operand:\n");
1017          ir->operands[operand]->print();
1018          exit(1);
1019       }
1020       op[operand] = this->result;
1021
1022       /* Matrix expression operands should have been broken down to vector
1023        * operations already.
1024        */
1025       assert(!ir->operands[operand]->type->is_matrix());
1026    }
1027
1028    int vector_elements = ir->operands[0]->type->vector_elements;
1029    if (ir->operands[1]) {
1030       vector_elements = MAX2(vector_elements,
1031                              ir->operands[1]->type->vector_elements);
1032    }
1033
1034    this->result.file = BAD_FILE;
1035
1036    /* Storage for our result.  Ideally for an assignment we'd be using
1037     * the actual storage for the result here, instead.
1038     */
1039    result_src = src_reg(this, ir->type);
1040    /* convenience for the emit functions below. */
1041    result_dst = dst_reg(result_src);
1042    /* If nothing special happens, this is the result. */
1043    this->result = result_src;
1044    /* Limit writes to the channels that will be used by result_src later.
1045     * This does limit this temp's use as a temporary for multi-instruction
1046     * sequences.
1047     */
1048    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1049
1050    switch (ir->operation) {
1051    case ir_unop_logic_not:
1052       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1053        * ones complement of the whole register, not just bit 0.
1054        */
1055       emit(XOR(result_dst, op[0], src_reg(1)));
1056       break;
1057    case ir_unop_neg:
1058       op[0].negate = !op[0].negate;
1059       this->result = op[0];
1060       break;
1061    case ir_unop_abs:
1062       op[0].abs = true;
1063       op[0].negate = false;
1064       this->result = op[0];
1065       break;
1066
1067    case ir_unop_sign:
1068       emit(MOV(result_dst, src_reg(0.0f)));
1069
1070       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1071       inst = emit(MOV(result_dst, src_reg(1.0f)));
1072       inst->predicate = BRW_PREDICATE_NORMAL;
1073
1074       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1075       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1076       inst->predicate = BRW_PREDICATE_NORMAL;
1077
1078       break;
1079
1080    case ir_unop_rcp:
1081       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1082       break;
1083
1084    case ir_unop_exp2:
1085       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1086       break;
1087    case ir_unop_log2:
1088       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1089       break;
1090    case ir_unop_exp:
1091    case ir_unop_log:
1092       assert(!"not reached: should be handled by ir_explog_to_explog2");
1093       break;
1094    case ir_unop_sin:
1095    case ir_unop_sin_reduced:
1096       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1097       break;
1098    case ir_unop_cos:
1099    case ir_unop_cos_reduced:
1100       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1101       break;
1102
1103    case ir_unop_dFdx:
1104    case ir_unop_dFdy:
1105       assert(!"derivatives not valid in vertex shader");
1106       break;
1107
1108    case ir_unop_noise:
1109       assert(!"not reached: should be handled by lower_noise");
1110       break;
1111
1112    case ir_binop_add:
1113       emit(ADD(result_dst, op[0], op[1]));
1114       break;
1115    case ir_binop_sub:
1116       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1117       break;
1118
1119    case ir_binop_mul:
1120       if (ir->type->is_integer()) {
1121          /* For integer multiplication, the MUL uses the low 16 bits
1122           * of one of the operands (src0 on gen6, src1 on gen7).  The
1123           * MACH accumulates in the contribution of the upper 16 bits
1124           * of that operand.
1125           *
1126           * FINISHME: Emit just the MUL if we know an operand is small
1127           * enough.
1128           */
1129          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1130
1131          emit(MUL(acc, op[0], op[1]));
1132          emit(MACH(dst_null_d(), op[0], op[1]));
1133          emit(MOV(result_dst, src_reg(acc)));
1134       } else {
1135          emit(MUL(result_dst, op[0], op[1]));
1136       }
1137       break;
1138    case ir_binop_div:
1139       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1140       assert(ir->type->is_integer());
1141       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1142       break;
1143    case ir_binop_mod:
1144       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1145       assert(ir->type->is_integer());
1146       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1147       break;
1148
1149    case ir_binop_less:
1150    case ir_binop_greater:
1151    case ir_binop_lequal:
1152    case ir_binop_gequal:
1153    case ir_binop_equal:
1154    case ir_binop_nequal: {
1155       emit(CMP(result_dst, op[0], op[1],
1156                brw_conditional_for_comparison(ir->operation)));
1157       emit(AND(result_dst, result_src, src_reg(0x1)));
1158       break;
1159    }
1160
1161    case ir_binop_all_equal:
1162       /* "==" operator producing a scalar boolean. */
1163       if (ir->operands[0]->type->is_vector() ||
1164           ir->operands[1]->type->is_vector()) {
1165          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1166          emit(MOV(result_dst, src_reg(0)));
1167          inst = emit(MOV(result_dst, src_reg(1)));
1168          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1169       } else {
1170          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1171          emit(AND(result_dst, result_src, src_reg(0x1)));
1172       }
1173       break;
1174    case ir_binop_any_nequal:
1175       /* "!=" operator producing a scalar boolean. */
1176       if (ir->operands[0]->type->is_vector() ||
1177           ir->operands[1]->type->is_vector()) {
1178          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1179
1180          emit(MOV(result_dst, src_reg(0)));
1181          inst = emit(MOV(result_dst, src_reg(1)));
1182          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1183       } else {
1184          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1185          emit(AND(result_dst, result_src, src_reg(0x1)));
1186       }
1187       break;
1188
1189    case ir_unop_any:
1190       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1191       emit(MOV(result_dst, src_reg(0)));
1192
1193       inst = emit(MOV(result_dst, src_reg(1)));
1194       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1195       break;
1196
1197    case ir_binop_logic_xor:
1198       emit(XOR(result_dst, op[0], op[1]));
1199       break;
1200
1201    case ir_binop_logic_or:
1202       emit(OR(result_dst, op[0], op[1]));
1203       break;
1204
1205    case ir_binop_logic_and:
1206       emit(AND(result_dst, op[0], op[1]));
1207       break;
1208
1209    case ir_binop_dot:
1210       assert(ir->operands[0]->type->is_vector());
1211       assert(ir->operands[0]->type == ir->operands[1]->type);
1212       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1213       break;
1214
1215    case ir_unop_sqrt:
1216       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1217       break;
1218    case ir_unop_rsq:
1219       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1220       break;
1221
1222    case ir_unop_bitcast_i2f:
1223    case ir_unop_bitcast_u2f:
1224       this->result = op[0];
1225       this->result.type = BRW_REGISTER_TYPE_F;
1226       break;
1227
1228    case ir_unop_bitcast_f2i:
1229       this->result = op[0];
1230       this->result.type = BRW_REGISTER_TYPE_D;
1231       break;
1232
1233    case ir_unop_bitcast_f2u:
1234       this->result = op[0];
1235       this->result.type = BRW_REGISTER_TYPE_UD;
1236       break;
1237
1238    case ir_unop_i2f:
1239    case ir_unop_i2u:
1240    case ir_unop_u2i:
1241    case ir_unop_u2f:
1242    case ir_unop_b2f:
1243    case ir_unop_b2i:
1244    case ir_unop_f2i:
1245    case ir_unop_f2u:
1246       emit(MOV(result_dst, op[0]));
1247       break;
1248    case ir_unop_f2b:
1249    case ir_unop_i2b: {
1250       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1251       emit(AND(result_dst, result_src, src_reg(1)));
1252       break;
1253    }
1254
1255    case ir_unop_trunc:
1256       emit(RNDZ(result_dst, op[0]));
1257       break;
1258    case ir_unop_ceil:
1259       op[0].negate = !op[0].negate;
1260       inst = emit(RNDD(result_dst, op[0]));
1261       this->result.negate = true;
1262       break;
1263    case ir_unop_floor:
1264       inst = emit(RNDD(result_dst, op[0]));
1265       break;
1266    case ir_unop_fract:
1267       inst = emit(FRC(result_dst, op[0]));
1268       break;
1269    case ir_unop_round_even:
1270       emit(RNDE(result_dst, op[0]));
1271       break;
1272
1273    case ir_binop_min:
1274       if (intel->gen >= 6) {
1275          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1276          inst->conditional_mod = BRW_CONDITIONAL_L;
1277       } else {
1278          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1279
1280          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1281          inst->predicate = BRW_PREDICATE_NORMAL;
1282       }
1283       break;
1284    case ir_binop_max:
1285       if (intel->gen >= 6) {
1286          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1287          inst->conditional_mod = BRW_CONDITIONAL_G;
1288       } else {
1289          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1290
1291          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1292          inst->predicate = BRW_PREDICATE_NORMAL;
1293       }
1294       break;
1295
1296    case ir_binop_pow:
1297       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1298       break;
1299
1300    case ir_unop_bit_not:
1301       inst = emit(NOT(result_dst, op[0]));
1302       break;
1303    case ir_binop_bit_and:
1304       inst = emit(AND(result_dst, op[0], op[1]));
1305       break;
1306    case ir_binop_bit_xor:
1307       inst = emit(XOR(result_dst, op[0], op[1]));
1308       break;
1309    case ir_binop_bit_or:
1310       inst = emit(OR(result_dst, op[0], op[1]));
1311       break;
1312
1313    case ir_binop_lshift:
1314       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1315       break;
1316
1317    case ir_binop_rshift:
1318       if (ir->type->base_type == GLSL_TYPE_INT)
1319          inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1320       else
1321          inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1322       break;
1323
1324    case ir_binop_ubo_load: {
1325       ir_constant *uniform_block = ir->operands[0]->as_constant();
1326       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1327       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1328       src_reg offset = op[1];
1329
1330       /* Now, load the vector from that offset. */
1331       assert(ir->type->is_vector() || ir->type->is_scalar());
1332
1333       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1334       packed_consts.type = result.type;
1335       src_reg surf_index =
1336          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1337       if (const_offset_ir) {
1338          offset = src_reg(const_offset / 16);
1339       } else {
1340          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1341       }
1342
1343       vec4_instruction *pull =
1344          emit(new(mem_ctx) vec4_instruction(this,
1345                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1346                                             dst_reg(packed_consts),
1347                                             surf_index,
1348                                             offset));
1349       pull->base_mrf = 14;
1350       pull->mlen = 1;
1351
1352       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1353       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1354                                             const_offset % 16 / 4,
1355                                             const_offset % 16 / 4,
1356                                             const_offset % 16 / 4);
1357
1358       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1359       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1360          emit(CMP(result_dst, packed_consts, src_reg(0u),
1361                   BRW_CONDITIONAL_NZ));
1362          emit(AND(result_dst, result, src_reg(0x1)));
1363       } else {
1364          emit(MOV(result_dst, packed_consts));
1365       }
1366       break;
1367    }
1368
1369    case ir_quadop_vector:
1370       assert(!"not reached: should be handled by lower_quadop_vector");
1371       break;
1372    }
1373 }
1374
1375
1376 void
1377 vec4_visitor::visit(ir_swizzle *ir)
1378 {
1379    src_reg src;
1380    int i = 0;
1381    int swizzle[4];
1382
1383    /* Note that this is only swizzles in expressions, not those on the left
1384     * hand side of an assignment, which do write masking.  See ir_assignment
1385     * for that.
1386     */
1387
1388    ir->val->accept(this);
1389    src = this->result;
1390    assert(src.file != BAD_FILE);
1391
1392    for (i = 0; i < ir->type->vector_elements; i++) {
1393       switch (i) {
1394       case 0:
1395          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1396          break;
1397       case 1:
1398          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1399          break;
1400       case 2:
1401          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1402          break;
1403       case 3:
1404          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1405             break;
1406       }
1407    }
1408    for (; i < 4; i++) {
1409       /* Replicate the last channel out. */
1410       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1411    }
1412
1413    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1414
1415    this->result = src;
1416 }
1417
1418 void
1419 vec4_visitor::visit(ir_dereference_variable *ir)
1420 {
1421    const struct glsl_type *type = ir->type;
1422    dst_reg *reg = variable_storage(ir->var);
1423
1424    if (!reg) {
1425       fail("Failed to find variable storage for %s\n", ir->var->name);
1426       this->result = src_reg(brw_null_reg());
1427       return;
1428    }
1429
1430    this->result = src_reg(*reg);
1431
1432    /* System values get their swizzle from the dst_reg writemask */
1433    if (ir->var->mode == ir_var_system_value)
1434       return;
1435
1436    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1437       this->result.swizzle = swizzle_for_size(type->vector_elements);
1438 }
1439
1440 void
1441 vec4_visitor::visit(ir_dereference_array *ir)
1442 {
1443    ir_constant *constant_index;
1444    src_reg src;
1445    int element_size = type_size(ir->type);
1446
1447    constant_index = ir->array_index->constant_expression_value();
1448
1449    ir->array->accept(this);
1450    src = this->result;
1451
1452    if (constant_index) {
1453       src.reg_offset += constant_index->value.i[0] * element_size;
1454    } else {
1455       /* Variable index array dereference.  It eats the "vec4" of the
1456        * base of the array and an index that offsets the Mesa register
1457        * index.
1458        */
1459       ir->array_index->accept(this);
1460
1461       src_reg index_reg;
1462
1463       if (element_size == 1) {
1464          index_reg = this->result;
1465       } else {
1466          index_reg = src_reg(this, glsl_type::int_type);
1467
1468          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1469       }
1470
1471       if (src.reladdr) {
1472          src_reg temp = src_reg(this, glsl_type::int_type);
1473
1474          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1475
1476          index_reg = temp;
1477       }
1478
1479       src.reladdr = ralloc(mem_ctx, src_reg);
1480       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1481    }
1482
1483    /* If the type is smaller than a vec4, replicate the last channel out. */
1484    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1485       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1486    else
1487       src.swizzle = BRW_SWIZZLE_NOOP;
1488    src.type = brw_type_for_base_type(ir->type);
1489
1490    this->result = src;
1491 }
1492
1493 void
1494 vec4_visitor::visit(ir_dereference_record *ir)
1495 {
1496    unsigned int i;
1497    const glsl_type *struct_type = ir->record->type;
1498    int offset = 0;
1499
1500    ir->record->accept(this);
1501
1502    for (i = 0; i < struct_type->length; i++) {
1503       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1504          break;
1505       offset += type_size(struct_type->fields.structure[i].type);
1506    }
1507
1508    /* If the type is smaller than a vec4, replicate the last channel out. */
1509    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1510       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1511    else
1512       this->result.swizzle = BRW_SWIZZLE_NOOP;
1513    this->result.type = brw_type_for_base_type(ir->type);
1514
1515    this->result.reg_offset += offset;
1516 }
1517
1518 /**
1519  * We want to be careful in assignment setup to hit the actual storage
1520  * instead of potentially using a temporary like we might with the
1521  * ir_dereference handler.
1522  */
1523 static dst_reg
1524 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1525 {
1526    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1527     * access of a vector, it must be separated into a series conditional moves
1528     * before reaching this point (see ir_vec_index_to_cond_assign).
1529     */
1530    assert(ir->as_dereference());
1531    ir_dereference_array *deref_array = ir->as_dereference_array();
1532    if (deref_array) {
1533       assert(!deref_array->array->type->is_vector());
1534    }
1535
1536    /* Use the rvalue deref handler for the most part.  We'll ignore
1537     * swizzles in it and write swizzles using writemask, though.
1538     */
1539    ir->accept(v);
1540    return dst_reg(v->result);
1541 }
1542
1543 void
1544 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1545                               const struct glsl_type *type, uint32_t predicate)
1546 {
1547    if (type->base_type == GLSL_TYPE_STRUCT) {
1548       for (unsigned int i = 0; i < type->length; i++) {
1549          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1550       }
1551       return;
1552    }
1553
1554    if (type->is_array()) {
1555       for (unsigned int i = 0; i < type->length; i++) {
1556          emit_block_move(dst, src, type->fields.array, predicate);
1557       }
1558       return;
1559    }
1560
1561    if (type->is_matrix()) {
1562       const struct glsl_type *vec_type;
1563
1564       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1565                                          type->vector_elements, 1);
1566
1567       for (int i = 0; i < type->matrix_columns; i++) {
1568          emit_block_move(dst, src, vec_type, predicate);
1569       }
1570       return;
1571    }
1572
1573    assert(type->is_scalar() || type->is_vector());
1574
1575    dst->type = brw_type_for_base_type(type);
1576    src->type = dst->type;
1577
1578    dst->writemask = (1 << type->vector_elements) - 1;
1579
1580    src->swizzle = swizzle_for_size(type->vector_elements);
1581
1582    vec4_instruction *inst = emit(MOV(*dst, *src));
1583    inst->predicate = predicate;
1584
1585    dst->reg_offset++;
1586    src->reg_offset++;
1587 }
1588
1589
1590 /* If the RHS processing resulted in an instruction generating a
1591  * temporary value, and it would be easy to rewrite the instruction to
1592  * generate its result right into the LHS instead, do so.  This ends
1593  * up reliably removing instructions where it can be tricky to do so
1594  * later without real UD chain information.
1595  */
1596 bool
1597 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1598                                      dst_reg dst,
1599                                      src_reg src,
1600                                      vec4_instruction *pre_rhs_inst,
1601                                      vec4_instruction *last_rhs_inst)
1602 {
1603    /* This could be supported, but it would take more smarts. */
1604    if (ir->condition)
1605       return false;
1606
1607    if (pre_rhs_inst == last_rhs_inst)
1608       return false; /* No instructions generated to work with. */
1609
1610    /* Make sure the last instruction generated our source reg. */
1611    if (src.file != GRF ||
1612        src.file != last_rhs_inst->dst.file ||
1613        src.reg != last_rhs_inst->dst.reg ||
1614        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1615        src.reladdr ||
1616        src.abs ||
1617        src.negate ||
1618        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1619       return false;
1620
1621    /* Check that that last instruction fully initialized the channels
1622     * we want to use, in the order we want to use them.  We could
1623     * potentially reswizzle the operands of many instructions so that
1624     * we could handle out of order channels, but don't yet.
1625     */
1626
1627    for (unsigned i = 0; i < 4; i++) {
1628       if (dst.writemask & (1 << i)) {
1629          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1630             return false;
1631
1632          if (BRW_GET_SWZ(src.swizzle, i) != i)
1633             return false;
1634       }
1635    }
1636
1637    /* Success!  Rewrite the instruction. */
1638    last_rhs_inst->dst.file = dst.file;
1639    last_rhs_inst->dst.reg = dst.reg;
1640    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1641    last_rhs_inst->dst.reladdr = dst.reladdr;
1642    last_rhs_inst->dst.writemask &= dst.writemask;
1643
1644    return true;
1645 }
1646
1647 void
1648 vec4_visitor::visit(ir_assignment *ir)
1649 {
1650    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1651    uint32_t predicate = BRW_PREDICATE_NONE;
1652
1653    if (!ir->lhs->type->is_scalar() &&
1654        !ir->lhs->type->is_vector()) {
1655       ir->rhs->accept(this);
1656       src_reg src = this->result;
1657
1658       if (ir->condition) {
1659          emit_bool_to_cond_code(ir->condition, &predicate);
1660       }
1661
1662       /* emit_block_move doesn't account for swizzles in the source register.
1663        * This should be ok, since the source register is a structure or an
1664        * array, and those can't be swizzled.  But double-check to be sure.
1665        */
1666       assert(src.swizzle ==
1667              (ir->rhs->type->is_matrix()
1668               ? swizzle_for_size(ir->rhs->type->vector_elements)
1669               : BRW_SWIZZLE_NOOP));
1670
1671       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1672       return;
1673    }
1674
1675    /* Now we're down to just a scalar/vector with writemasks. */
1676    int i;
1677
1678    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1679    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1680
1681    ir->rhs->accept(this);
1682
1683    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1684
1685    src_reg src = this->result;
1686
1687    int swizzles[4];
1688    int first_enabled_chan = 0;
1689    int src_chan = 0;
1690
1691    assert(ir->lhs->type->is_vector() ||
1692           ir->lhs->type->is_scalar());
1693    dst.writemask = ir->write_mask;
1694
1695    for (int i = 0; i < 4; i++) {
1696       if (dst.writemask & (1 << i)) {
1697          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1698          break;
1699       }
1700    }
1701
1702    /* Swizzle a small RHS vector into the channels being written.
1703     *
1704     * glsl ir treats write_mask as dictating how many channels are
1705     * present on the RHS while in our instructions we need to make
1706     * those channels appear in the slots of the vec4 they're written to.
1707     */
1708    for (int i = 0; i < 4; i++) {
1709       if (dst.writemask & (1 << i))
1710          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1711       else
1712          swizzles[i] = first_enabled_chan;
1713    }
1714    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1715                               swizzles[2], swizzles[3]);
1716
1717    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1718       return;
1719    }
1720
1721    if (ir->condition) {
1722       emit_bool_to_cond_code(ir->condition, &predicate);
1723    }
1724
1725    for (i = 0; i < type_size(ir->lhs->type); i++) {
1726       vec4_instruction *inst = emit(MOV(dst, src));
1727       inst->predicate = predicate;
1728
1729       dst.reg_offset++;
1730       src.reg_offset++;
1731    }
1732 }
1733
1734 void
1735 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1736 {
1737    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1738       foreach_list(node, &ir->components) {
1739          ir_constant *field_value = (ir_constant *)node;
1740
1741          emit_constant_values(dst, field_value);
1742       }
1743       return;
1744    }
1745
1746    if (ir->type->is_array()) {
1747       for (unsigned int i = 0; i < ir->type->length; i++) {
1748          emit_constant_values(dst, ir->array_elements[i]);
1749       }
1750       return;
1751    }
1752
1753    if (ir->type->is_matrix()) {
1754       for (int i = 0; i < ir->type->matrix_columns; i++) {
1755          float *vec = &ir->value.f[i * ir->type->vector_elements];
1756
1757          for (int j = 0; j < ir->type->vector_elements; j++) {
1758             dst->writemask = 1 << j;
1759             dst->type = BRW_REGISTER_TYPE_F;
1760
1761             emit(MOV(*dst, src_reg(vec[j])));
1762          }
1763          dst->reg_offset++;
1764       }
1765       return;
1766    }
1767
1768    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1769
1770    for (int i = 0; i < ir->type->vector_elements; i++) {
1771       if (!(remaining_writemask & (1 << i)))
1772          continue;
1773
1774       dst->writemask = 1 << i;
1775       dst->type = brw_type_for_base_type(ir->type);
1776
1777       /* Find other components that match the one we're about to
1778        * write.  Emits fewer instructions for things like vec4(0.5,
1779        * 1.5, 1.5, 1.5).
1780        */
1781       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1782          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1783             if (ir->value.b[i] == ir->value.b[j])
1784                dst->writemask |= (1 << j);
1785          } else {
1786             /* u, i, and f storage all line up, so no need for a
1787              * switch case for comparing each type.
1788              */
1789             if (ir->value.u[i] == ir->value.u[j])
1790                dst->writemask |= (1 << j);
1791          }
1792       }
1793
1794       switch (ir->type->base_type) {
1795       case GLSL_TYPE_FLOAT:
1796          emit(MOV(*dst, src_reg(ir->value.f[i])));
1797          break;
1798       case GLSL_TYPE_INT:
1799          emit(MOV(*dst, src_reg(ir->value.i[i])));
1800          break;
1801       case GLSL_TYPE_UINT:
1802          emit(MOV(*dst, src_reg(ir->value.u[i])));
1803          break;
1804       case GLSL_TYPE_BOOL:
1805          emit(MOV(*dst, src_reg(ir->value.b[i])));
1806          break;
1807       default:
1808          assert(!"Non-float/uint/int/bool constant");
1809          break;
1810       }
1811
1812       remaining_writemask &= ~dst->writemask;
1813    }
1814    dst->reg_offset++;
1815 }
1816
1817 void
1818 vec4_visitor::visit(ir_constant *ir)
1819 {
1820    dst_reg dst = dst_reg(this, ir->type);
1821    this->result = src_reg(dst);
1822
1823    emit_constant_values(&dst, ir);
1824 }
1825
1826 void
1827 vec4_visitor::visit(ir_call *ir)
1828 {
1829    assert(!"not reached");
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_texture *ir)
1834 {
1835    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1836
1837    /* Should be lowered by do_lower_texture_projection */
1838    assert(!ir->projector);
1839
1840    /* Generate code to compute all the subexpression trees.  This has to be
1841     * done before loading any values into MRFs for the sampler message since
1842     * generating these values may involve SEND messages that need the MRFs.
1843     */
1844    src_reg coordinate;
1845    if (ir->coordinate) {
1846       ir->coordinate->accept(this);
1847       coordinate = this->result;
1848    }
1849
1850    src_reg shadow_comparitor;
1851    if (ir->shadow_comparitor) {
1852       ir->shadow_comparitor->accept(this);
1853       shadow_comparitor = this->result;
1854    }
1855
1856    src_reg lod, dPdx, dPdy;
1857    switch (ir->op) {
1858    case ir_txf:
1859    case ir_txl:
1860    case ir_txs:
1861       ir->lod_info.lod->accept(this);
1862       lod = this->result;
1863       break;
1864    case ir_txd:
1865       ir->lod_info.grad.dPdx->accept(this);
1866       dPdx = this->result;
1867
1868       ir->lod_info.grad.dPdy->accept(this);
1869       dPdy = this->result;
1870       break;
1871    case ir_tex:
1872    case ir_txb:
1873       break;
1874    }
1875
1876    vec4_instruction *inst = NULL;
1877    switch (ir->op) {
1878    case ir_tex:
1879    case ir_txl:
1880       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1881       break;
1882    case ir_txd:
1883       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1884       break;
1885    case ir_txf:
1886       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1887       break;
1888    case ir_txs:
1889       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1890       break;
1891    case ir_txb:
1892       assert(!"TXB is not valid for vertex shaders.");
1893    }
1894
1895    /* Texel offsets go in the message header; Gen4 also requires headers. */
1896    inst->header_present = ir->offset || intel->gen < 5;
1897    inst->base_mrf = 2;
1898    inst->mlen = inst->header_present + 1; /* always at least one */
1899    inst->sampler = sampler;
1900    inst->dst = dst_reg(this, ir->type);
1901    inst->shadow_compare = ir->shadow_comparitor != NULL;
1902
1903    if (ir->offset != NULL && ir->op != ir_txf)
1904       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1905
1906    /* MRF for the first parameter */
1907    int param_base = inst->base_mrf + inst->header_present;
1908
1909    if (ir->op == ir_txs) {
1910       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1911       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1912            lod));
1913    } else {
1914       int i, coord_mask = 0, zero_mask = 0;
1915       /* Load the coordinate */
1916       /* FINISHME: gl_clamp_mask and saturate */
1917       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1918          coord_mask |= (1 << i);
1919       for (; i < 4; i++)
1920          zero_mask |= (1 << i);
1921
1922       if (ir->offset && ir->op == ir_txf) {
1923          /* It appears that the ld instruction used for txf does its
1924           * address bounds check before adding in the offset.  To work
1925           * around this, just add the integer offset to the integer
1926           * texel coordinate, and don't put the offset in the header.
1927           */
1928          ir_constant *offset = ir->offset->as_constant();
1929          assert(offset);
1930
1931          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1932             src_reg src = coordinate;
1933             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1934                                        BRW_GET_SWZ(src.swizzle, j),
1935                                        BRW_GET_SWZ(src.swizzle, j),
1936                                        BRW_GET_SWZ(src.swizzle, j));
1937             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1938                      src, offset->value.i[j]));
1939          }
1940       } else {
1941          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1942                   coordinate));
1943       }
1944       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1945                src_reg(0)));
1946       /* Load the shadow comparitor */
1947       if (ir->shadow_comparitor) {
1948          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1949                           WRITEMASK_X),
1950                   shadow_comparitor));
1951          inst->mlen++;
1952       }
1953
1954       /* Load the LOD info */
1955       if (ir->op == ir_txl) {
1956          int mrf, writemask;
1957          if (intel->gen >= 5) {
1958             mrf = param_base + 1;
1959             if (ir->shadow_comparitor) {
1960                writemask = WRITEMASK_Y;
1961                /* mlen already incremented */
1962             } else {
1963                writemask = WRITEMASK_X;
1964                inst->mlen++;
1965             }
1966          } else /* intel->gen == 4 */ {
1967             mrf = param_base;
1968             writemask = WRITEMASK_Z;
1969          }
1970          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
1971       } else if (ir->op == ir_txf) {
1972          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1973                   lod));
1974       } else if (ir->op == ir_txd) {
1975          const glsl_type *type = ir->lod_info.grad.dPdx->type;
1976
1977          if (intel->gen >= 5) {
1978             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1979             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1980             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1981             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1982             inst->mlen++;
1983
1984             if (ir->type->vector_elements == 3) {
1985                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1986                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1987                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1988                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1989                inst->mlen++;
1990             }
1991          } else /* intel->gen == 4 */ {
1992             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1993             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1994             inst->mlen += 2;
1995          }
1996       }
1997    }
1998
1999    emit(inst);
2000
2001    swizzle_result(ir, src_reg(inst->dst), sampler);
2002 }
2003
2004 void
2005 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2006 {
2007    this->result = orig_val;
2008
2009    int s = c->key.tex.swizzles[sampler];
2010
2011    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2012                         || s == SWIZZLE_NOOP)
2013       return;
2014
2015    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2016    int swizzle[4];
2017
2018    for (int i = 0; i < 4; i++) {
2019       switch (GET_SWZ(s, i)) {
2020       case SWIZZLE_ZERO:
2021          zero_mask |= (1 << i);
2022          break;
2023       case SWIZZLE_ONE:
2024          one_mask |= (1 << i);
2025          break;
2026       default:
2027          copy_mask |= (1 << i);
2028          swizzle[i] = GET_SWZ(s, i);
2029          break;
2030       }
2031    }
2032
2033    this->result = src_reg(this, ir->type);
2034    dst_reg swizzled_result(this->result);
2035
2036    if (copy_mask) {
2037       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2038       swizzled_result.writemask = copy_mask;
2039       emit(MOV(swizzled_result, orig_val));
2040    }
2041
2042    if (zero_mask) {
2043       swizzled_result.writemask = zero_mask;
2044       emit(MOV(swizzled_result, src_reg(0.0f)));
2045    }
2046
2047    if (one_mask) {
2048       swizzled_result.writemask = one_mask;
2049       emit(MOV(swizzled_result, src_reg(1.0f)));
2050    }
2051 }
2052
2053 void
2054 vec4_visitor::visit(ir_return *ir)
2055 {
2056    assert(!"not reached");
2057 }
2058
2059 void
2060 vec4_visitor::visit(ir_discard *ir)
2061 {
2062    assert(!"not reached");
2063 }
2064
2065 void
2066 vec4_visitor::visit(ir_if *ir)
2067 {
2068    /* Don't point the annotation at the if statement, because then it plus
2069     * the then and else blocks get printed.
2070     */
2071    this->base_ir = ir->condition;
2072
2073    if (intel->gen == 6) {
2074       emit_if_gen6(ir);
2075    } else {
2076       uint32_t predicate;
2077       emit_bool_to_cond_code(ir->condition, &predicate);
2078       emit(IF(predicate));
2079    }
2080
2081    visit_instructions(&ir->then_instructions);
2082
2083    if (!ir->else_instructions.is_empty()) {
2084       this->base_ir = ir->condition;
2085       emit(BRW_OPCODE_ELSE);
2086
2087       visit_instructions(&ir->else_instructions);
2088    }
2089
2090    this->base_ir = ir->condition;
2091    emit(BRW_OPCODE_ENDIF);
2092 }
2093
2094 void
2095 vec4_visitor::emit_ndc_computation()
2096 {
2097    /* Get the position */
2098    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2099
2100    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2101    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2102    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2103
2104    current_annotation = "NDC";
2105    dst_reg ndc_w = ndc;
2106    ndc_w.writemask = WRITEMASK_W;
2107    src_reg pos_w = pos;
2108    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2109    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2110
2111    dst_reg ndc_xyz = ndc;
2112    ndc_xyz.writemask = WRITEMASK_XYZ;
2113
2114    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2115 }
2116
2117 void
2118 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2119 {
2120    if (intel->gen < 6 &&
2121        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2122         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2123       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2124       dst_reg header1_w = header1;
2125       header1_w.writemask = WRITEMASK_W;
2126       GLuint i;
2127
2128       emit(MOV(header1, 0u));
2129
2130       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2131          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2132
2133          current_annotation = "Point size";
2134          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2135          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2136       }
2137
2138       current_annotation = "Clipping flags";
2139       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2140          vec4_instruction *inst;
2141
2142          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2143                          src_reg(this->userplane[i])));
2144          inst->conditional_mod = BRW_CONDITIONAL_L;
2145
2146          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2147          inst->predicate = BRW_PREDICATE_NORMAL;
2148       }
2149
2150       /* i965 clipping workaround:
2151        * 1) Test for -ve rhw
2152        * 2) If set,
2153        *      set ndc = (0,0,0,0)
2154        *      set ucp[6] = 1
2155        *
2156        * Later, clipping will detect ucp[6] and ensure the primitive is
2157        * clipped against all fixed planes.
2158        */
2159       if (brw->has_negative_rhw_bug) {
2160 #if 0
2161          /* FINISHME */
2162          brw_CMP(p,
2163                  vec8(brw_null_reg()),
2164                  BRW_CONDITIONAL_L,
2165                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2166                  brw_imm_f(0));
2167
2168          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2169          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2170          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2171 #endif
2172       }
2173
2174       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2175    } else if (intel->gen < 6) {
2176       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2177    } else {
2178       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2179       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2180          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2181                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2182       }
2183    }
2184 }
2185
2186 void
2187 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2188 {
2189    if (intel->gen < 6) {
2190       /* Clip distance slots are set aside in gen5, but they are not used.  It
2191        * is not clear whether we actually need to set aside space for them,
2192        * but the performance cost is negligible.
2193        */
2194       return;
2195    }
2196
2197    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2198     *
2199     *     "If a linked set of shaders forming the vertex stage contains no
2200     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2201     *     application has requested clipping against user clip planes through
2202     *     the API, then the coordinate written to gl_Position is used for
2203     *     comparison against the user clip planes."
2204     *
2205     * This function is only called if the shader didn't write to
2206     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2207     * if the user wrote to it; otherwise we use gl_Position.
2208     */
2209    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2210    if (!(c->prog_data.outputs_written
2211          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2212       clip_vertex = VERT_RESULT_HPOS;
2213    }
2214
2215    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2216         ++i) {
2217       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2218                src_reg(output_reg[clip_vertex]),
2219                src_reg(this->userplane[i + offset])));
2220    }
2221 }
2222
2223 void
2224 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2225 {
2226    assert (vert_result < VERT_RESULT_MAX);
2227    reg.type = output_reg[vert_result].type;
2228    current_annotation = output_reg_annotation[vert_result];
2229    /* Copy the register, saturating if necessary */
2230    vec4_instruction *inst = emit(MOV(reg,
2231                                      src_reg(output_reg[vert_result])));
2232    if ((vert_result == VERT_RESULT_COL0 ||
2233         vert_result == VERT_RESULT_COL1 ||
2234         vert_result == VERT_RESULT_BFC0 ||
2235         vert_result == VERT_RESULT_BFC1) &&
2236        c->key.clamp_vertex_color) {
2237       inst->saturate = true;
2238    }
2239 }
2240
2241 void
2242 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2243 {
2244    struct brw_reg hw_reg = brw_message_reg(mrf);
2245    dst_reg reg = dst_reg(MRF, mrf);
2246    reg.type = BRW_REGISTER_TYPE_F;
2247
2248    switch (vert_result) {
2249    case VERT_RESULT_PSIZ:
2250       /* PSIZ is always in slot 0, and is coupled with other flags. */
2251       current_annotation = "indices, point width, clip flags";
2252       emit_psiz_and_flags(hw_reg);
2253       break;
2254    case BRW_VERT_RESULT_NDC:
2255       current_annotation = "NDC";
2256       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2257       break;
2258    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2259    case VERT_RESULT_HPOS:
2260       current_annotation = "gl_Position";
2261       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2262       break;
2263    case VERT_RESULT_CLIP_DIST0:
2264    case VERT_RESULT_CLIP_DIST1:
2265       if (this->c->key.uses_clip_distance) {
2266          emit_generic_urb_slot(reg, vert_result);
2267       } else {
2268          current_annotation = "user clip distances";
2269          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2270       }
2271       break;
2272    case VERT_RESULT_EDGE:
2273       /* This is present when doing unfilled polygons.  We're supposed to copy
2274        * the edge flag from the user-provided vertex array
2275        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2276        * of that attribute (starts as 1.0f).  This is then used in clipping to
2277        * determine which edges should be drawn as wireframe.
2278        */
2279       current_annotation = "edge flag";
2280       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2281                                     glsl_type::float_type, WRITEMASK_XYZW))));
2282       break;
2283    case BRW_VERT_RESULT_PAD:
2284       /* No need to write to this slot */
2285       break;
2286    default:
2287       emit_generic_urb_slot(reg, vert_result);
2288       break;
2289    }
2290 }
2291
2292 static int
2293 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2294 {
2295    struct intel_context *intel = &brw->intel;
2296
2297    if (intel->gen >= 6) {
2298       /* URB data written (does not include the message header reg) must
2299        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2300        * section 5.4.3.2.2: URB_INTERLEAVED.
2301        *
2302        * URB entries are allocated on a multiple of 1024 bits, so an
2303        * extra 128 bits written here to make the end align to 256 is
2304        * no problem.
2305        */
2306       if ((mlen % 2) != 1)
2307          mlen++;
2308    }
2309
2310    return mlen;
2311 }
2312
2313 /**
2314  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2315  * complete the VS thread.
2316  *
2317  * The VUE layout is documented in Volume 2a.
2318  */
2319 void
2320 vec4_visitor::emit_urb_writes()
2321 {
2322    /* MRF 0 is reserved for the debugger, so start with message header
2323     * in MRF 1.
2324     */
2325    int base_mrf = 1;
2326    int mrf = base_mrf;
2327    /* In the process of generating our URB write message contents, we
2328     * may need to unspill a register or load from an array.  Those
2329     * reads would use MRFs 14-15.
2330     */
2331    int max_usable_mrf = 13;
2332
2333    /* The following assertion verifies that max_usable_mrf causes an
2334     * even-numbered amount of URB write data, which will meet gen6's
2335     * requirements for length alignment.
2336     */
2337    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2338
2339    /* First mrf is the g0-based message header containing URB handles and such,
2340     * which is implied in VS_OPCODE_URB_WRITE.
2341     */
2342    mrf++;
2343
2344    if (intel->gen < 6) {
2345       emit_ndc_computation();
2346    }
2347
2348    /* Set up the VUE data for the first URB write */
2349    int slot;
2350    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2351       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2352
2353       /* If this was max_usable_mrf, we can't fit anything more into this URB
2354        * WRITE.
2355        */
2356       if (mrf > max_usable_mrf) {
2357          slot++;
2358          break;
2359       }
2360    }
2361
2362    current_annotation = "URB write";
2363    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2364    inst->base_mrf = base_mrf;
2365    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2366    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2367
2368    /* Optional second URB write */
2369    if (!inst->eot) {
2370       mrf = base_mrf + 1;
2371
2372       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2373          assert(mrf < max_usable_mrf);
2374
2375          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2376       }
2377
2378       current_annotation = "URB write";
2379       inst = emit(VS_OPCODE_URB_WRITE);
2380       inst->base_mrf = base_mrf;
2381       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2382       inst->eot = true;
2383       /* URB destination offset.  In the previous write, we got MRFs
2384        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2385        * URB row increments, and each of our MRFs is half of one of
2386        * those, since we're doing interleaved writes.
2387        */
2388       inst->offset = (max_usable_mrf - base_mrf) / 2;
2389    }
2390 }
2391
2392 src_reg
2393 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2394                                  src_reg *reladdr, int reg_offset)
2395 {
2396    /* Because we store the values to scratch interleaved like our
2397     * vertex data, we need to scale the vec4 index by 2.
2398     */
2399    int message_header_scale = 2;
2400
2401    /* Pre-gen6, the message header uses byte offsets instead of vec4
2402     * (16-byte) offset units.
2403     */
2404    if (intel->gen < 6)
2405       message_header_scale *= 16;
2406
2407    if (reladdr) {
2408       src_reg index = src_reg(this, glsl_type::int_type);
2409
2410       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2411       emit_before(inst, MUL(dst_reg(index),
2412                             index, src_reg(message_header_scale)));
2413
2414       return index;
2415    } else {
2416       return src_reg(reg_offset * message_header_scale);
2417    }
2418 }
2419
2420 src_reg
2421 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2422                                        src_reg *reladdr, int reg_offset)
2423 {
2424    if (reladdr) {
2425       src_reg index = src_reg(this, glsl_type::int_type);
2426
2427       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2428
2429       /* Pre-gen6, the message header uses byte offsets instead of vec4
2430        * (16-byte) offset units.
2431        */
2432       if (intel->gen < 6) {
2433          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2434       }
2435
2436       return index;
2437    } else {
2438       int message_header_scale = intel->gen < 6 ? 16 : 1;
2439       return src_reg(reg_offset * message_header_scale);
2440    }
2441 }
2442
2443 /**
2444  * Emits an instruction before @inst to load the value named by @orig_src
2445  * from scratch space at @base_offset to @temp.
2446  *
2447  * @base_offset is measured in 32-byte units (the size of a register).
2448  */
2449 void
2450 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2451                                 dst_reg temp, src_reg orig_src,
2452                                 int base_offset)
2453 {
2454    int reg_offset = base_offset + orig_src.reg_offset;
2455    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2456
2457    emit_before(inst, SCRATCH_READ(temp, index));
2458 }
2459
2460 /**
2461  * Emits an instruction after @inst to store the value to be written
2462  * to @orig_dst to scratch space at @base_offset, from @temp.
2463  *
2464  * @base_offset is measured in 32-byte units (the size of a register).
2465  */
2466 void
2467 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2468                                  src_reg temp, dst_reg orig_dst,
2469                                  int base_offset)
2470 {
2471    int reg_offset = base_offset + orig_dst.reg_offset;
2472    src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2473
2474    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2475                                        orig_dst.writemask));
2476    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2477    write->predicate = inst->predicate;
2478    write->ir = inst->ir;
2479    write->annotation = inst->annotation;
2480    inst->insert_after(write);
2481 }
2482
2483 /**
2484  * We can't generally support array access in GRF space, because a
2485  * single instruction's destination can only span 2 contiguous
2486  * registers.  So, we send all GRF arrays that get variable index
2487  * access to scratch space.
2488  */
2489 void
2490 vec4_visitor::move_grf_array_access_to_scratch()
2491 {
2492    int scratch_loc[this->virtual_grf_count];
2493
2494    for (int i = 0; i < this->virtual_grf_count; i++) {
2495       scratch_loc[i] = -1;
2496    }
2497
2498    /* First, calculate the set of virtual GRFs that need to be punted
2499     * to scratch due to having any array access on them, and where in
2500     * scratch.
2501     */
2502    foreach_list(node, &this->instructions) {
2503       vec4_instruction *inst = (vec4_instruction *)node;
2504
2505       if (inst->dst.file == GRF && inst->dst.reladdr &&
2506           scratch_loc[inst->dst.reg] == -1) {
2507          scratch_loc[inst->dst.reg] = c->last_scratch;
2508          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2509       }
2510
2511       for (int i = 0 ; i < 3; i++) {
2512          src_reg *src = &inst->src[i];
2513
2514          if (src->file == GRF && src->reladdr &&
2515              scratch_loc[src->reg] == -1) {
2516             scratch_loc[src->reg] = c->last_scratch;
2517             c->last_scratch += this->virtual_grf_sizes[src->reg];
2518          }
2519       }
2520    }
2521
2522    /* Now, for anything that will be accessed through scratch, rewrite
2523     * it to load/store.  Note that this is a _safe list walk, because
2524     * we may generate a new scratch_write instruction after the one
2525     * we're processing.
2526     */
2527    foreach_list_safe(node, &this->instructions) {
2528       vec4_instruction *inst = (vec4_instruction *)node;
2529
2530       /* Set up the annotation tracking for new generated instructions. */
2531       base_ir = inst->ir;
2532       current_annotation = inst->annotation;
2533
2534       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2535          src_reg temp = src_reg(this, glsl_type::vec4_type);
2536
2537          emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2538
2539          inst->dst.file = temp.file;
2540          inst->dst.reg = temp.reg;
2541          inst->dst.reg_offset = temp.reg_offset;
2542          inst->dst.reladdr = NULL;
2543       }
2544
2545       for (int i = 0 ; i < 3; i++) {
2546          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2547             continue;
2548
2549          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2550
2551          emit_scratch_read(inst, temp, inst->src[i],
2552                            scratch_loc[inst->src[i].reg]);
2553
2554          inst->src[i].file = temp.file;
2555          inst->src[i].reg = temp.reg;
2556          inst->src[i].reg_offset = temp.reg_offset;
2557          inst->src[i].reladdr = NULL;
2558       }
2559    }
2560 }
2561
2562 /**
2563  * Emits an instruction before @inst to load the value named by @orig_src
2564  * from the pull constant buffer (surface) at @base_offset to @temp.
2565  */
2566 void
2567 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2568                                       dst_reg temp, src_reg orig_src,
2569                                       int base_offset)
2570 {
2571    int reg_offset = base_offset + orig_src.reg_offset;
2572    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2573    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2574    vec4_instruction *load;
2575
2576    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2577                                         temp, index, offset);
2578    load->base_mrf = 14;
2579    load->mlen = 1;
2580    emit_before(inst, load);
2581 }
2582
2583 /**
2584  * Implements array access of uniforms by inserting a
2585  * PULL_CONSTANT_LOAD instruction.
2586  *
2587  * Unlike temporary GRF array access (where we don't support it due to
2588  * the difficulty of doing relative addressing on instruction
2589  * destinations), we could potentially do array access of uniforms
2590  * that were loaded in GRF space as push constants.  In real-world
2591  * usage we've seen, though, the arrays being used are always larger
2592  * than we could load as push constants, so just always move all
2593  * uniform array access out to a pull constant buffer.
2594  */
2595 void
2596 vec4_visitor::move_uniform_array_access_to_pull_constants()
2597 {
2598    int pull_constant_loc[this->uniforms];
2599
2600    for (int i = 0; i < this->uniforms; i++) {
2601       pull_constant_loc[i] = -1;
2602    }
2603
2604    /* Walk through and find array access of uniforms.  Put a copy of that
2605     * uniform in the pull constant buffer.
2606     *
2607     * Note that we don't move constant-indexed accesses to arrays.  No
2608     * testing has been done of the performance impact of this choice.
2609     */
2610    foreach_list_safe(node, &this->instructions) {
2611       vec4_instruction *inst = (vec4_instruction *)node;
2612
2613       for (int i = 0 ; i < 3; i++) {
2614          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2615             continue;
2616
2617          int uniform = inst->src[i].reg;
2618
2619          /* If this array isn't already present in the pull constant buffer,
2620           * add it.
2621           */
2622          if (pull_constant_loc[uniform] == -1) {
2623             const float **values = &prog_data->param[uniform * 4];
2624
2625             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2626
2627             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2628                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2629             }
2630          }
2631
2632          /* Set up the annotation tracking for new generated instructions. */
2633          base_ir = inst->ir;
2634          current_annotation = inst->annotation;
2635
2636          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2637
2638          emit_pull_constant_load(inst, temp, inst->src[i],
2639                                  pull_constant_loc[uniform]);
2640
2641          inst->src[i].file = temp.file;
2642          inst->src[i].reg = temp.reg;
2643          inst->src[i].reg_offset = temp.reg_offset;
2644          inst->src[i].reladdr = NULL;
2645       }
2646    }
2647
2648    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2649     * no need to track them as larger-than-vec4 objects.  This will be
2650     * relied on in cutting out unused uniform vectors from push
2651     * constants.
2652     */
2653    split_uniform_registers();
2654 }
2655
2656 void
2657 vec4_visitor::resolve_ud_negate(src_reg *reg)
2658 {
2659    if (reg->type != BRW_REGISTER_TYPE_UD ||
2660        !reg->negate)
2661       return;
2662
2663    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2664    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2665    *reg = temp;
2666 }
2667
2668 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2669                            struct gl_shader_program *prog,
2670                            struct brw_shader *shader)
2671 {
2672    this->c = c;
2673    this->p = &c->func;
2674    this->brw = p->brw;
2675    this->intel = &brw->intel;
2676    this->ctx = &intel->ctx;
2677    this->prog = prog;
2678    this->shader = shader;
2679
2680    this->mem_ctx = ralloc_context(NULL);
2681    this->failed = false;
2682
2683    this->base_ir = NULL;
2684    this->current_annotation = NULL;
2685
2686    this->c = c;
2687    this->vp = (struct gl_vertex_program *)
2688      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2689    this->prog_data = &c->prog_data;
2690
2691    this->variable_ht = hash_table_ctor(0,
2692                                        hash_table_pointer_hash,
2693                                        hash_table_pointer_compare);
2694
2695    this->virtual_grf_def = NULL;
2696    this->virtual_grf_use = NULL;
2697    this->virtual_grf_sizes = NULL;
2698    this->virtual_grf_count = 0;
2699    this->virtual_grf_reg_map = NULL;
2700    this->virtual_grf_reg_count = 0;
2701    this->virtual_grf_array_size = 0;
2702    this->live_intervals_valid = false;
2703
2704    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2705
2706    this->uniforms = 0;
2707 }
2708
2709 vec4_visitor::~vec4_visitor()
2710 {
2711    ralloc_free(this->mem_ctx);
2712    hash_table_dtor(this->variable_ht);
2713 }
2714
2715
2716 void
2717 vec4_visitor::fail(const char *format, ...)
2718 {
2719    va_list va;
2720    char *msg;
2721
2722    if (failed)
2723       return;
2724
2725    failed = true;
2726
2727    va_start(va, format);
2728    msg = ralloc_vasprintf(mem_ctx, format, va);
2729    va_end(va);
2730    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2731
2732    this->fail_msg = msg;
2733
2734    if (INTEL_DEBUG & DEBUG_VS) {
2735       fprintf(stderr, "%s",  msg);
2736    }
2737 }
2738
2739 } /* namespace brw */