src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/context.h"
  27 #include "main/macros.h"
  28 #include "program/prog_parameter.h"
  29 #include "program/sampler.h"
  30 }
  31
  32 namespace brw {
  33
  34 vec4_instruction::vec4_instruction(vec4_visitor *v,
  35                                    enum opcode opcode, dst_reg dst,
  36                                    src_reg src0, src_reg src1, src_reg src2)
  37 {
  38    this->opcode = opcode;
  39    this->dst = dst;
  40    this->src[0] = src0;
  41    this->src[1] = src1;
  42    this->src[2] = src2;
  43    this->ir = v->base_ir;
  44    this->annotation = v->current_annotation;
  45 }
  46
  47 vec4_instruction *
  48 vec4_visitor::emit(vec4_instruction *inst)
  49 {
  50    this->instructions.push_tail(inst);
  51
  52    return inst;
  53 }
  54
  55 vec4_instruction *
  56 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  57 {
  58    new_inst->ir = inst->ir;
  59    new_inst->annotation = inst->annotation;
  60
  61    inst->insert_before(new_inst);
  62
  63    return inst;
  64 }
  65
  66 vec4_instruction *
  67 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  68                    src_reg src0, src_reg src1, src_reg src2)
  69 {
  70    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  71                                              src0, src1, src2));
  72 }
  73
  74
  75 vec4_instruction *
  76 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  77 {
  78    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  79 }
  80
  81 vec4_instruction *
  82 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  83 {
  84    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  85 }
  86
  87 vec4_instruction *
  88 vec4_visitor::emit(enum opcode opcode)
  89 {
  90    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  91 }
  92
  93 #define ALU1(op)                                                        \
  94    vec4_instruction *                                                   \
  95    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  96    {                                                                    \
  97       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  98                                            src0);                       \
  99    }
 100
 101 #define ALU2(op)                                                        \
 102    vec4_instruction *                                                   \
 103    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 104    {                                                                    \
 105       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 106                                            src0, src1);                 \
 107    }
 108
 109 ALU1(NOT)
 110 ALU1(MOV)
 111 ALU1(FRC)
 112 ALU1(RNDD)
 113 ALU1(RNDE)
 114 ALU1(RNDZ)
 115 ALU2(ADD)
 116 ALU2(MUL)
 117 ALU2(MACH)
 118 ALU2(AND)
 119 ALU2(OR)
 120 ALU2(XOR)
 121 ALU2(DP3)
 122 ALU2(DP4)
 123 ALU2(DPH)
 124 ALU2(SHL)
 125 ALU2(SHR)
 126 ALU2(ASR)
 127
 128 /** Gen4 predicated IF. */
 129 vec4_instruction *
 130 vec4_visitor::IF(uint32_t predicate)
 131 {
 132    vec4_instruction *inst;
 133
 134    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 135    inst->predicate = predicate;
 136
 137    return inst;
 138 }
 139
 140 /** Gen6+ IF with embedded comparison. */
 141 vec4_instruction *
 142 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 143 {
 144    assert(intel->gen >= 6);
 145
 146    vec4_instruction *inst;
 147
 148    resolve_ud_negate(&src0);
 149    resolve_ud_negate(&src1);
 150
 151    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 152                                         src0, src1);
 153    inst->conditional_mod = condition;
 154
 155    return inst;
 156 }
 157
 158 /**
 159  * CMP: Sets the low bit of the destination channels with the result
 160  * of the comparison, while the upper bits are undefined, and updates
 161  * the flag register with the packed 16 bits of the result.
 162  */
 163 vec4_instruction *
 164 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 165 {
 166    vec4_instruction *inst;
 167
 168    /* original gen4 does type conversion to the destination type
 169     * before before comparison, producing garbage results for floating
 170     * point comparisons.
 171     */
 172    if (intel->gen == 4) {
 173       dst.type = src0.type;
 174       if (dst.file == HW_REG)
 175          dst.fixed_hw_reg.type = dst.type;
 176    }
 177
 178    resolve_ud_negate(&src0);
 179    resolve_ud_negate(&src1);
 180
 181    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 182    inst->conditional_mod = condition;
 183
 184    return inst;
 185 }
 186
 187 vec4_instruction *
 188 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 189 {
 190    vec4_instruction *inst;
 191
 192    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 193                                         dst, index);
 194    inst->base_mrf = 14;
 195    inst->mlen = 2;
 196
 197    return inst;
 198 }
 199
 200 vec4_instruction *
 201 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 202 {
 203    vec4_instruction *inst;
 204
 205    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 206                                         dst, src, index);
 207    inst->base_mrf = 13;
 208    inst->mlen = 3;
 209
 210    return inst;
 211 }
 212
 213 void
 214 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 215 {
 216    static enum opcode dot_opcodes[] = {
 217       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 218    };
 219
 220    emit(dot_opcodes[elements - 2], dst, src0, src1);
 221 }
 222
 223 void
 224 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 225 {
 226    /* The gen6 math instruction ignores the source modifiers --
 227     * swizzle, abs, negate, and at least some parts of the register
 228     * region description.
 229     *
 230     * While it would seem that this MOV could be avoided at this point
 231     * in the case that the swizzle is matched up with the destination
 232     * writemask, note that uniform packing and register allocation
 233     * could rearrange our swizzle, so let's leave this matter up to
 234     * copy propagation later.
 235     */
 236    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 237    emit(MOV(dst_reg(temp_src), src));
 238
 239    if (dst.writemask != WRITEMASK_XYZW) {
 240       /* The gen6 math instruction must be align1, so we can't do
 241        * writemasks.
 242        */
 243       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 244
 245       emit(opcode, temp_dst, temp_src);
 246
 247       emit(MOV(dst, src_reg(temp_dst)));
 248    } else {
 249       emit(opcode, dst, temp_src);
 250    }
 251 }
 252
 253 void
 254 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 255 {
 256    vec4_instruction *inst = emit(opcode, dst, src);
 257    inst->base_mrf = 1;
 258    inst->mlen = 1;
 259 }
 260
 261 void
 262 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 263 {
 264    switch (opcode) {
 265    case SHADER_OPCODE_RCP:
 266    case SHADER_OPCODE_RSQ:
 267    case SHADER_OPCODE_SQRT:
 268    case SHADER_OPCODE_EXP2:
 269    case SHADER_OPCODE_LOG2:
 270    case SHADER_OPCODE_SIN:
 271    case SHADER_OPCODE_COS:
 272       break;
 273    default:
 274       assert(!"not reached: bad math opcode");
 275       return;
 276    }
 277
 278    if (intel->gen >= 7) {
 279       emit(opcode, dst, src);
 280    } else if (intel->gen == 6) {
 281       return emit_math1_gen6(opcode, dst, src);
 282    } else {
 283       return emit_math1_gen4(opcode, dst, src);
 284    }
 285 }
 286
 287 void
 288 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 289                               dst_reg dst, src_reg src0, src_reg src1)
 290 {
 291    src_reg expanded;
 292
 293    /* The gen6 math instruction ignores the source modifiers --
 294     * swizzle, abs, negate, and at least some parts of the register
 295     * region description.  Move the sources to temporaries to make it
 296     * generally work.
 297     */
 298
 299    expanded = src_reg(this, glsl_type::vec4_type);
 300    expanded.type = src0.type;
 301    emit(MOV(dst_reg(expanded), src0));
 302    src0 = expanded;
 303
 304    expanded = src_reg(this, glsl_type::vec4_type);
 305    expanded.type = src1.type;
 306    emit(MOV(dst_reg(expanded), src1));
 307    src1 = expanded;
 308
 309    if (dst.writemask != WRITEMASK_XYZW) {
 310       /* The gen6 math instruction must be align1, so we can't do
 311        * writemasks.
 312        */
 313       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 314       temp_dst.type = dst.type;
 315
 316       emit(opcode, temp_dst, src0, src1);
 317
 318       emit(MOV(dst, src_reg(temp_dst)));
 319    } else {
 320       emit(opcode, dst, src0, src1);
 321    }
 322 }
 323
 324 void
 325 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 326                               dst_reg dst, src_reg src0, src_reg src1)
 327 {
 328    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 329    inst->base_mrf = 1;
 330    inst->mlen = 2;
 331 }
 332
 333 void
 334 vec4_visitor::emit_math(enum opcode opcode,
 335                         dst_reg dst, src_reg src0, src_reg src1)
 336 {
 337    switch (opcode) {
 338    case SHADER_OPCODE_POW:
 339    case SHADER_OPCODE_INT_QUOTIENT:
 340    case SHADER_OPCODE_INT_REMAINDER:
 341       break;
 342    default:
 343       assert(!"not reached: unsupported binary math opcode");
 344       return;
 345    }
 346
 347    if (intel->gen >= 7) {
 348       emit(opcode, dst, src0, src1);
 349    } else if (intel->gen == 6) {
 350       return emit_math2_gen6(opcode, dst, src0, src1);
 351    } else {
 352       return emit_math2_gen4(opcode, dst, src0, src1);
 353    }
 354 }
 355
 356 void
 357 vec4_visitor::visit_instructions(const exec_list *list)
 358 {
 359    foreach_list(node, list) {
 360       ir_instruction *ir = (ir_instruction *)node;
 361
 362       base_ir = ir;
 363       ir->accept(this);
 364    }
 365 }
 366
 367
 368 static int
 369 type_size(const struct glsl_type *type)
 370 {
 371    unsigned int i;
 372    int size;
 373
 374    switch (type->base_type) {
 375    case GLSL_TYPE_UINT:
 376    case GLSL_TYPE_INT:
 377    case GLSL_TYPE_FLOAT:
 378    case GLSL_TYPE_BOOL:
 379       if (type->is_matrix()) {
 380          return type->matrix_columns;
 381       } else {
 382          /* Regardless of size of vector, it gets a vec4. This is bad
 383           * packing for things like floats, but otherwise arrays become a
 384           * mess.  Hopefully a later pass over the code can pack scalars
 385           * down if appropriate.
 386           */
 387          return 1;
 388       }
 389    case GLSL_TYPE_ARRAY:
 390       assert(type->length > 0);
 391       return type_size(type->fields.array) * type->length;
 392    case GLSL_TYPE_STRUCT:
 393       size = 0;
 394       for (i = 0; i < type->length; i++) {
 395          size += type_size(type->fields.structure[i].type);
 396       }
 397       return size;
 398    case GLSL_TYPE_SAMPLER:
 399       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 400        * at link time.
 401        */
 402       return 1;
 403    default:
 404       assert(0);
 405       return 0;
 406    }
 407 }
 408
 409 int
 410 vec4_visitor::virtual_grf_alloc(int size)
 411 {
 412    if (virtual_grf_array_size <= virtual_grf_count) {
 413       if (virtual_grf_array_size == 0)
 414          virtual_grf_array_size = 16;
 415       else
 416          virtual_grf_array_size *= 2;
 417       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 418                                    virtual_grf_array_size);
 419       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 420                                      virtual_grf_array_size);
 421    }
 422    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 423    virtual_grf_reg_count += size;
 424    virtual_grf_sizes[virtual_grf_count] = size;
 425    return virtual_grf_count++;
 426 }
 427
 428 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 429 {
 430    init();
 431
 432    this->file = GRF;
 433    this->reg = v->virtual_grf_alloc(type_size(type));
 434
 435    if (type->is_array() || type->is_record()) {
 436       this->swizzle = BRW_SWIZZLE_NOOP;
 437    } else {
 438       this->swizzle = swizzle_for_size(type->vector_elements);
 439    }
 440
 441    this->type = brw_type_for_base_type(type);
 442 }
 443
 444 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 445 {
 446    init();
 447
 448    this->file = GRF;
 449    this->reg = v->virtual_grf_alloc(type_size(type));
 450
 451    if (type->is_array() || type->is_record()) {
 452       this->writemask = WRITEMASK_XYZW;
 453    } else {
 454       this->writemask = (1 << type->vector_elements) - 1;
 455    }
 456
 457    this->type = brw_type_for_base_type(type);
 458 }
 459
 460 /* Our support for uniforms is piggy-backed on the struct
 461  * gl_fragment_program, because that's where the values actually
 462  * get stored, rather than in some global gl_shader_program uniform
 463  * store.
 464  */
 465 int
 466 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 467 {
 468    unsigned int offset = 0;
 469    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 470
 471    if (type->is_matrix()) {
 472       const glsl_type *column = type->column_type();
 473
 474       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 475          offset += setup_uniform_values(loc + offset, column);
 476       }
 477
 478       return offset;
 479    }
 480
 481    switch (type->base_type) {
 482    case GLSL_TYPE_FLOAT:
 483    case GLSL_TYPE_UINT:
 484    case GLSL_TYPE_INT:
 485    case GLSL_TYPE_BOOL:
 486       for (unsigned int i = 0; i < type->vector_elements; i++) {
 487          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 488       }
 489
 490       /* Set up pad elements to get things aligned to a vec4 boundary. */
 491       for (unsigned int i = type->vector_elements; i < 4; i++) {
 492          static float zero = 0;
 493
 494          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 495       }
 496
 497       /* Track the size of this uniform vector, for future packing of
 498        * uniforms.
 499        */
 500       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 501       this->uniforms++;
 502
 503       return 1;
 504
 505    case GLSL_TYPE_STRUCT:
 506       for (unsigned int i = 0; i < type->length; i++) {
 507          offset += setup_uniform_values(loc + offset,
 508                                         type->fields.structure[i].type);
 509       }
 510       return offset;
 511
 512    case GLSL_TYPE_ARRAY:
 513       for (unsigned int i = 0; i < type->length; i++) {
 514          offset += setup_uniform_values(loc + offset, type->fields.array);
 515       }
 516       return offset;
 517
 518    case GLSL_TYPE_SAMPLER:
 519       /* The sampler takes up a slot, but we don't use any values from it. */
 520       return 1;
 521
 522    default:
 523       assert(!"not reached");
 524       return 0;
 525    }
 526 }
 527
 528 void
 529 vec4_visitor::setup_uniform_clipplane_values()
 530 {
 531    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 532
 533    if (intel->gen < 6) {
 534       /* Pre-Gen6, we compact clip planes.  For example, if the user
 535        * enables just clip planes 0, 1, and 3, we will enable clip planes
 536        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 537        * plane 2.  This simplifies the implementation of the Gen6 clip
 538        * thread.
 539        */
 540       int compacted_clipplane_index = 0;
 541       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
 542          if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
 543             continue;
 544
 545          this->uniform_vector_size[this->uniforms] = 4;
 546          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 547          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 548          for (int j = 0; j < 4; ++j) {
 549             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 550          }
 551          ++compacted_clipplane_index;
 552          ++this->uniforms;
 553       }
 554    } else {
 555       /* In Gen6 and later, we don't compact clip planes, because this
 556        * simplifies the implementation of gl_ClipDistance.
 557        */
 558       for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 559          this->uniform_vector_size[this->uniforms] = 4;
 560          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 561          this->userplane[i].type = BRW_REGISTER_TYPE_F;
 562          for (int j = 0; j < 4; ++j) {
 563             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 564          }
 565          ++this->uniforms;
 566       }
 567    }
 568 }
 569
 570 /* Our support for builtin uniforms is even scarier than non-builtin.
 571  * It sits on top of the PROG_STATE_VAR parameters that are
 572  * automatically updated from GL context state.
 573  */
 574 void
 575 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 576 {
 577    const ir_state_slot *const slots = ir->state_slots;
 578    assert(ir->state_slots != NULL);
 579
 580    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 581       /* This state reference has already been setup by ir_to_mesa,
 582        * but we'll get the same index back here.  We can reference
 583        * ParameterValues directly, since unlike brw_fs.cpp, we never
 584        * add new state references during compile.
 585        */
 586       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 587                                             (gl_state_index *)slots[i].tokens);
 588       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 589
 590       this->uniform_vector_size[this->uniforms] = 0;
 591       /* Add each of the unique swizzled channels of the element.
 592        * This will end up matching the size of the glsl_type of this field.
 593        */
 594       int last_swiz = -1;
 595       for (unsigned int j = 0; j < 4; j++) {
 596          int swiz = GET_SWZ(slots[i].swizzle, j);
 597          last_swiz = swiz;
 598
 599          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 600          if (swiz <= last_swiz)
 601             this->uniform_vector_size[this->uniforms]++;
 602       }
 603       this->uniforms++;
 604    }
 605 }
 606
 607 dst_reg *
 608 vec4_visitor::variable_storage(ir_variable *var)
 609 {
 610    return (dst_reg *)hash_table_find(this->variable_ht, var);
 611 }
 612
 613 void
 614 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 615 {
 616    ir_expression *expr = ir->as_expression();
 617
 618    *predicate = BRW_PREDICATE_NORMAL;
 619
 620    if (expr) {
 621       src_reg op[2];
 622       vec4_instruction *inst;
 623
 624       assert(expr->get_num_operands() <= 2);
 625       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 626          expr->operands[i]->accept(this);
 627          op[i] = this->result;
 628
 629          resolve_ud_negate(&op[i]);
 630       }
 631
 632       switch (expr->operation) {
 633       case ir_unop_logic_not:
 634          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 635          inst->conditional_mod = BRW_CONDITIONAL_Z;
 636          break;
 637
 638       case ir_binop_logic_xor:
 639          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 640          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 641          break;
 642
 643       case ir_binop_logic_or:
 644          inst = emit(OR(dst_null_d(), op[0], op[1]));
 645          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 646          break;
 647
 648       case ir_binop_logic_and:
 649          inst = emit(AND(dst_null_d(), op[0], op[1]));
 650          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 651          break;
 652
 653       case ir_unop_f2b:
 654          if (intel->gen >= 6) {
 655             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 656          } else {
 657             inst = emit(MOV(dst_null_f(), op[0]));
 658             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 659          }
 660          break;
 661
 662       case ir_unop_i2b:
 663          if (intel->gen >= 6) {
 664             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 665          } else {
 666             inst = emit(MOV(dst_null_d(), op[0]));
 667             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 668          }
 669          break;
 670
 671       case ir_binop_all_equal:
 672          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 673          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 674          break;
 675
 676       case ir_binop_any_nequal:
 677          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 678          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 679          break;
 680
 681       case ir_unop_any:
 682          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 683          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 684          break;
 685
 686       case ir_binop_greater:
 687       case ir_binop_gequal:
 688       case ir_binop_less:
 689       case ir_binop_lequal:
 690       case ir_binop_equal:
 691       case ir_binop_nequal:
 692          emit(CMP(dst_null_d(), op[0], op[1],
 693                   brw_conditional_for_comparison(expr->operation)));
 694          break;
 695
 696       default:
 697          assert(!"not reached");
 698          break;
 699       }
 700       return;
 701    }
 702
 703    ir->accept(this);
 704
 705    resolve_ud_negate(&this->result);
 706
 707    if (intel->gen >= 6) {
 708       vec4_instruction *inst = emit(AND(dst_null_d(),
 709                                         this->result, src_reg(1)));
 710       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 711    } else {
 712       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 713       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 714    }
 715 }
 716
 717 /**
 718  * Emit a gen6 IF statement with the comparison folded into the IF
 719  * instruction.
 720  */
 721 void
 722 vec4_visitor::emit_if_gen6(ir_if *ir)
 723 {
 724    ir_expression *expr = ir->condition->as_expression();
 725
 726    if (expr) {
 727       src_reg op[2];
 728       dst_reg temp;
 729
 730       assert(expr->get_num_operands() <= 2);
 731       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 732          expr->operands[i]->accept(this);
 733          op[i] = this->result;
 734       }
 735
 736       switch (expr->operation) {
 737       case ir_unop_logic_not:
 738          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 739          return;
 740
 741       case ir_binop_logic_xor:
 742          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 743          return;
 744
 745       case ir_binop_logic_or:
 746          temp = dst_reg(this, glsl_type::bool_type);
 747          emit(OR(temp, op[0], op[1]));
 748          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 749          return;
 750
 751       case ir_binop_logic_and:
 752          temp = dst_reg(this, glsl_type::bool_type);
 753          emit(AND(temp, op[0], op[1]));
 754          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 755          return;
 756
 757       case ir_unop_f2b:
 758          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 759          return;
 760
 761       case ir_unop_i2b:
 762          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 763          return;
 764
 765       case ir_binop_greater:
 766       case ir_binop_gequal:
 767       case ir_binop_less:
 768       case ir_binop_lequal:
 769       case ir_binop_equal:
 770       case ir_binop_nequal:
 771          emit(IF(op[0], op[1],
 772                  brw_conditional_for_comparison(expr->operation)));
 773          return;
 774
 775       case ir_binop_all_equal:
 776          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 777          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 778          return;
 779
 780       case ir_binop_any_nequal:
 781          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 782          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 783          return;
 784
 785       case ir_unop_any:
 786          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 787          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 788          return;
 789
 790       default:
 791          assert(!"not reached");
 792          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 793          return;
 794       }
 795       return;
 796    }
 797
 798    ir->condition->accept(this);
 799
 800    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 801 }
 802
 803 static dst_reg
 804 with_writemask(dst_reg const & r, int mask)
 805 {
 806    dst_reg result = r;
 807    result.writemask = mask;
 808    return result;
 809 }
 810
 811 void
 812 vec4_visitor::emit_attribute_fixups()
 813 {
 814    dst_reg sign_recovery_shift;
 815    dst_reg normalize_factor;
 816    dst_reg es3_normalize_factor;
 817
 818    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
 819       if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
 820          uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
 821          dst_reg reg(ATTR, i);
 822          dst_reg reg_d = reg;
 823          reg_d.type = BRW_REGISTER_TYPE_D;
 824          dst_reg reg_ud = reg;
 825          reg_ud.type = BRW_REGISTER_TYPE_UD;
 826
 827          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 828           * come in as floating point conversions of the integer values.
 829           */
 830          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
 831             dst_reg dst = reg;
 832             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 833             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
 834             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 835          }
 836
 837          /* Do sign recovery for 2101010 formats if required. */
 838          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 839             if (sign_recovery_shift.file == BAD_FILE) {
 840                /* shift constant: <22,22,22,30> */
 841                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
 842                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
 843                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
 844             }
 845
 846             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
 847             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
 848          }
 849
 850          /* Apply BGRA swizzle if required. */
 851          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
 852             src_reg temp = src_reg(reg);
 853             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
 854             emit(MOV(reg, temp));
 855          }
 856
 857          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
 858             /* ES 3.0 has different rules for converting signed normalized
 859              * fixed-point numbers than desktop GL.
 860              */
 861             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
 862                /* According to equation 2.2 of the ES 3.0 specification,
 863                 * signed normalization conversion is done by:
 864                 *
 865                 * f = c / (2^(b-1)-1)
 866                 */
 867                if (es3_normalize_factor.file == BAD_FILE) {
 868                   /* mul constant: 1 / (2^(b-1) - 1) */
 869                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
 870                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
 871                            src_reg(1.0f / ((1<<9) - 1))));
 872                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
 873                            src_reg(1.0f / ((1<<1) - 1))));
 874                }
 875
 876                dst_reg dst = reg;
 877                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 878                emit(MOV(dst, src_reg(reg_d)));
 879                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
 880                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
 881             } else {
 882                /* The following equations are from the OpenGL 3.2 specification:
 883                 *
 884                 * 2.1 unsigned normalization
 885                 * f = c/(2^n-1)
 886                 *
 887                 * 2.2 signed normalization
 888                 * f = (2c+1)/(2^n-1)
 889                 *
 890                 * Both of these share a common divisor, which is represented by
 891                 * "normalize_factor" in the code below.
 892                 */
 893                if (normalize_factor.file == BAD_FILE) {
 894                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
 895                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
 896                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
 897                            src_reg(1.0f / ((1<<10) - 1))));
 898                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
 899                            src_reg(1.0f / ((1<<2) - 1))));
 900                }
 901
 902                dst_reg dst = reg;
 903                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 904                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
 905
 906                /* For signed normalization, we want the numerator to be 2c+1. */
 907                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 908                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
 909                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
 910                }
 911
 912                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
 913             }
 914          }
 915
 916          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
 917             dst_reg dst = reg;
 918             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 919             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
 920          }
 921       }
 922    }
 923 }
 924
 925 void
 926 vec4_visitor::visit(ir_variable *ir)
 927 {
 928    dst_reg *reg = NULL;
 929
 930    if (variable_storage(ir))
 931       return;
 932
 933    switch (ir->mode) {
 934    case ir_var_in:
 935       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 936       break;
 937
 938    case ir_var_out:
 939       reg = new(mem_ctx) dst_reg(this, ir->type);
 940
 941       for (int i = 0; i < type_size(ir->type); i++) {
 942          output_reg[ir->location + i] = *reg;
 943          output_reg[ir->location + i].reg_offset = i;
 944          output_reg[ir->location + i].type =
 945             brw_type_for_base_type(ir->type->get_scalar_type());
 946          output_reg_annotation[ir->location + i] = ir->name;
 947       }
 948       break;
 949
 950    case ir_var_auto:
 951    case ir_var_temporary:
 952       reg = new(mem_ctx) dst_reg(this, ir->type);
 953       break;
 954
 955    case ir_var_uniform:
 956       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 957
 958       /* Thanks to the lower_ubo_reference pass, we will see only
 959        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 960        * variables, so no need for them to be in variable_ht.
 961        */
 962       if (ir->uniform_block != -1)
 963          return;
 964
 965       /* Track how big the whole uniform variable is, in case we need to put a
 966        * copy of its data into pull constants for array access.
 967        */
 968       this->uniform_size[this->uniforms] = type_size(ir->type);
 969
 970       if (!strncmp(ir->name, "gl_", 3)) {
 971          setup_builtin_uniform_values(ir);
 972       } else {
 973          setup_uniform_values(ir->location, ir->type);
 974       }
 975       break;
 976
 977    case ir_var_system_value:
 978       /* VertexID is stored by the VF as the last vertex element, but
 979        * we don't represent it with a flag in inputs_read, so we call
 980        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 981        */
 982       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 983       prog_data->uses_vertexid = true;
 984
 985       switch (ir->location) {
 986       case SYSTEM_VALUE_VERTEX_ID:
 987          reg->writemask = WRITEMASK_X;
 988          break;
 989       case SYSTEM_VALUE_INSTANCE_ID:
 990          reg->writemask = WRITEMASK_Y;
 991          break;
 992       default:
 993          assert(!"not reached");
 994          break;
 995       }
 996       break;
 997
 998    default:
 999       assert(!"not reached");
1000    }
1001
1002    reg->type = brw_type_for_base_type(ir->type);
1003    hash_table_insert(this->variable_ht, reg, ir);
1004 }
1005
1006 void
1007 vec4_visitor::visit(ir_loop *ir)
1008 {
1009    dst_reg counter;
1010
1011    /* We don't want debugging output to print the whole body of the
1012     * loop as the annotation.
1013     */
1014    this->base_ir = NULL;
1015
1016    if (ir->counter != NULL) {
1017       this->base_ir = ir->counter;
1018       ir->counter->accept(this);
1019       counter = *(variable_storage(ir->counter));
1020
1021       if (ir->from != NULL) {
1022          this->base_ir = ir->from;
1023          ir->from->accept(this);
1024
1025          emit(MOV(counter, this->result));
1026       }
1027    }
1028
1029    emit(BRW_OPCODE_DO);
1030
1031    if (ir->to) {
1032       this->base_ir = ir->to;
1033       ir->to->accept(this);
1034
1035       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1036                brw_conditional_for_comparison(ir->cmp)));
1037
1038       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1039       inst->predicate = BRW_PREDICATE_NORMAL;
1040    }
1041
1042    visit_instructions(&ir->body_instructions);
1043
1044
1045    if (ir->increment) {
1046       this->base_ir = ir->increment;
1047       ir->increment->accept(this);
1048       emit(ADD(counter, src_reg(counter), this->result));
1049    }
1050
1051    emit(BRW_OPCODE_WHILE);
1052 }
1053
1054 void
1055 vec4_visitor::visit(ir_loop_jump *ir)
1056 {
1057    switch (ir->mode) {
1058    case ir_loop_jump::jump_break:
1059       emit(BRW_OPCODE_BREAK);
1060       break;
1061    case ir_loop_jump::jump_continue:
1062       emit(BRW_OPCODE_CONTINUE);
1063       break;
1064    }
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_function_signature *ir)
1070 {
1071    assert(0);
1072    (void)ir;
1073 }
1074
1075 void
1076 vec4_visitor::visit(ir_function *ir)
1077 {
1078    /* Ignore function bodies other than main() -- we shouldn't see calls to
1079     * them since they should all be inlined.
1080     */
1081    if (strcmp(ir->name, "main") == 0) {
1082       const ir_function_signature *sig;
1083       exec_list empty;
1084
1085       sig = ir->matching_signature(&empty);
1086
1087       assert(sig);
1088
1089       visit_instructions(&sig->body);
1090    }
1091 }
1092
1093 bool
1094 vec4_visitor::try_emit_sat(ir_expression *ir)
1095 {
1096    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1097    if (!sat_src)
1098       return false;
1099
1100    sat_src->accept(this);
1101    src_reg src = this->result;
1102
1103    this->result = src_reg(this, ir->type);
1104    vec4_instruction *inst;
1105    inst = emit(MOV(dst_reg(this->result), src));
1106    inst->saturate = true;
1107
1108    return true;
1109 }
1110
1111 void
1112 vec4_visitor::emit_bool_comparison(unsigned int op,
1113                                  dst_reg dst, src_reg src0, src_reg src1)
1114 {
1115    /* original gen4 does destination conversion before comparison. */
1116    if (intel->gen < 5)
1117       dst.type = src0.type;
1118
1119    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1120
1121    dst.type = BRW_REGISTER_TYPE_D;
1122    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1123 }
1124
1125 void
1126 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1127                           src_reg src0, src_reg src1)
1128 {
1129    vec4_instruction *inst;
1130
1131    if (intel->gen >= 6) {
1132       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1133       inst->conditional_mod = conditionalmod;
1134    } else {
1135       emit(CMP(dst, src0, src1, conditionalmod));
1136
1137       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138       inst->predicate = BRW_PREDICATE_NORMAL;
1139    }
1140 }
1141
1142 void
1143 vec4_visitor::visit(ir_expression *ir)
1144 {
1145    unsigned int operand;
1146    src_reg op[Elements(ir->operands)];
1147    src_reg result_src;
1148    dst_reg result_dst;
1149    vec4_instruction *inst;
1150
1151    if (try_emit_sat(ir))
1152       return;
1153
1154    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1155       this->result.file = BAD_FILE;
1156       ir->operands[operand]->accept(this);
1157       if (this->result.file == BAD_FILE) {
1158          printf("Failed to get tree for expression operand:\n");
1159          ir->operands[operand]->print();
1160          exit(1);
1161       }
1162       op[operand] = this->result;
1163
1164       /* Matrix expression operands should have been broken down to vector
1165        * operations already.
1166        */
1167       assert(!ir->operands[operand]->type->is_matrix());
1168    }
1169
1170    int vector_elements = ir->operands[0]->type->vector_elements;
1171    if (ir->operands[1]) {
1172       vector_elements = MAX2(vector_elements,
1173                              ir->operands[1]->type->vector_elements);
1174    }
1175
1176    this->result.file = BAD_FILE;
1177
1178    /* Storage for our result.  Ideally for an assignment we'd be using
1179     * the actual storage for the result here, instead.
1180     */
1181    result_src = src_reg(this, ir->type);
1182    /* convenience for the emit functions below. */
1183    result_dst = dst_reg(result_src);
1184    /* If nothing special happens, this is the result. */
1185    this->result = result_src;
1186    /* Limit writes to the channels that will be used by result_src later.
1187     * This does limit this temp's use as a temporary for multi-instruction
1188     * sequences.
1189     */
1190    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1191
1192    switch (ir->operation) {
1193    case ir_unop_logic_not:
1194       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1195        * ones complement of the whole register, not just bit 0.
1196        */
1197       emit(XOR(result_dst, op[0], src_reg(1)));
1198       break;
1199    case ir_unop_neg:
1200       op[0].negate = !op[0].negate;
1201       this->result = op[0];
1202       break;
1203    case ir_unop_abs:
1204       op[0].abs = true;
1205       op[0].negate = false;
1206       this->result = op[0];
1207       break;
1208
1209    case ir_unop_sign:
1210       emit(MOV(result_dst, src_reg(0.0f)));
1211
1212       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1213       inst = emit(MOV(result_dst, src_reg(1.0f)));
1214       inst->predicate = BRW_PREDICATE_NORMAL;
1215
1216       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1217       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1218       inst->predicate = BRW_PREDICATE_NORMAL;
1219
1220       break;
1221
1222    case ir_unop_rcp:
1223       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1224       break;
1225
1226    case ir_unop_exp2:
1227       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1228       break;
1229    case ir_unop_log2:
1230       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1231       break;
1232    case ir_unop_exp:
1233    case ir_unop_log:
1234       assert(!"not reached: should be handled by ir_explog_to_explog2");
1235       break;
1236    case ir_unop_sin:
1237    case ir_unop_sin_reduced:
1238       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1239       break;
1240    case ir_unop_cos:
1241    case ir_unop_cos_reduced:
1242       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1243       break;
1244
1245    case ir_unop_dFdx:
1246    case ir_unop_dFdy:
1247       assert(!"derivatives not valid in vertex shader");
1248       break;
1249
1250    case ir_unop_noise:
1251       assert(!"not reached: should be handled by lower_noise");
1252       break;
1253
1254    case ir_binop_add:
1255       emit(ADD(result_dst, op[0], op[1]));
1256       break;
1257    case ir_binop_sub:
1258       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1259       break;
1260
1261    case ir_binop_mul:
1262       if (ir->type->is_integer()) {
1263          /* For integer multiplication, the MUL uses the low 16 bits
1264           * of one of the operands (src0 on gen6, src1 on gen7).  The
1265           * MACH accumulates in the contribution of the upper 16 bits
1266           * of that operand.
1267           *
1268           * FINISHME: Emit just the MUL if we know an operand is small
1269           * enough.
1270           */
1271          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1272
1273          emit(MUL(acc, op[0], op[1]));
1274          emit(MACH(dst_null_d(), op[0], op[1]));
1275          emit(MOV(result_dst, src_reg(acc)));
1276       } else {
1277          emit(MUL(result_dst, op[0], op[1]));
1278       }
1279       break;
1280    case ir_binop_div:
1281       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1282       assert(ir->type->is_integer());
1283       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1284       break;
1285    case ir_binop_mod:
1286       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1287       assert(ir->type->is_integer());
1288       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1289       break;
1290
1291    case ir_binop_less:
1292    case ir_binop_greater:
1293    case ir_binop_lequal:
1294    case ir_binop_gequal:
1295    case ir_binop_equal:
1296    case ir_binop_nequal: {
1297       emit(CMP(result_dst, op[0], op[1],
1298                brw_conditional_for_comparison(ir->operation)));
1299       emit(AND(result_dst, result_src, src_reg(0x1)));
1300       break;
1301    }
1302
1303    case ir_binop_all_equal:
1304       /* "==" operator producing a scalar boolean. */
1305       if (ir->operands[0]->type->is_vector() ||
1306           ir->operands[1]->type->is_vector()) {
1307          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1308          emit(MOV(result_dst, src_reg(0)));
1309          inst = emit(MOV(result_dst, src_reg(1)));
1310          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1311       } else {
1312          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1313          emit(AND(result_dst, result_src, src_reg(0x1)));
1314       }
1315       break;
1316    case ir_binop_any_nequal:
1317       /* "!=" operator producing a scalar boolean. */
1318       if (ir->operands[0]->type->is_vector() ||
1319           ir->operands[1]->type->is_vector()) {
1320          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1321
1322          emit(MOV(result_dst, src_reg(0)));
1323          inst = emit(MOV(result_dst, src_reg(1)));
1324          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1325       } else {
1326          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1327          emit(AND(result_dst, result_src, src_reg(0x1)));
1328       }
1329       break;
1330
1331    case ir_unop_any:
1332       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1333       emit(MOV(result_dst, src_reg(0)));
1334
1335       inst = emit(MOV(result_dst, src_reg(1)));
1336       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1337       break;
1338
1339    case ir_binop_logic_xor:
1340       emit(XOR(result_dst, op[0], op[1]));
1341       break;
1342
1343    case ir_binop_logic_or:
1344       emit(OR(result_dst, op[0], op[1]));
1345       break;
1346
1347    case ir_binop_logic_and:
1348       emit(AND(result_dst, op[0], op[1]));
1349       break;
1350
1351    case ir_binop_dot:
1352       assert(ir->operands[0]->type->is_vector());
1353       assert(ir->operands[0]->type == ir->operands[1]->type);
1354       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1355       break;
1356
1357    case ir_unop_sqrt:
1358       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1359       break;
1360    case ir_unop_rsq:
1361       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1362       break;
1363
1364    case ir_unop_bitcast_i2f:
1365    case ir_unop_bitcast_u2f:
1366       this->result = op[0];
1367       this->result.type = BRW_REGISTER_TYPE_F;
1368       break;
1369
1370    case ir_unop_bitcast_f2i:
1371       this->result = op[0];
1372       this->result.type = BRW_REGISTER_TYPE_D;
1373       break;
1374
1375    case ir_unop_bitcast_f2u:
1376       this->result = op[0];
1377       this->result.type = BRW_REGISTER_TYPE_UD;
1378       break;
1379
1380    case ir_unop_i2f:
1381    case ir_unop_i2u:
1382    case ir_unop_u2i:
1383    case ir_unop_u2f:
1384    case ir_unop_b2f:
1385    case ir_unop_b2i:
1386    case ir_unop_f2i:
1387    case ir_unop_f2u:
1388       emit(MOV(result_dst, op[0]));
1389       break;
1390    case ir_unop_f2b:
1391    case ir_unop_i2b: {
1392       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1393       emit(AND(result_dst, result_src, src_reg(1)));
1394       break;
1395    }
1396
1397    case ir_unop_trunc:
1398       emit(RNDZ(result_dst, op[0]));
1399       break;
1400    case ir_unop_ceil:
1401       op[0].negate = !op[0].negate;
1402       inst = emit(RNDD(result_dst, op[0]));
1403       this->result.negate = true;
1404       break;
1405    case ir_unop_floor:
1406       inst = emit(RNDD(result_dst, op[0]));
1407       break;
1408    case ir_unop_fract:
1409       inst = emit(FRC(result_dst, op[0]));
1410       break;
1411    case ir_unop_round_even:
1412       emit(RNDE(result_dst, op[0]));
1413       break;
1414
1415    case ir_binop_min:
1416       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1417       break;
1418    case ir_binop_max:
1419       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1420       break;
1421
1422    case ir_binop_pow:
1423       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1424       break;
1425
1426    case ir_unop_bit_not:
1427       inst = emit(NOT(result_dst, op[0]));
1428       break;
1429    case ir_binop_bit_and:
1430       inst = emit(AND(result_dst, op[0], op[1]));
1431       break;
1432    case ir_binop_bit_xor:
1433       inst = emit(XOR(result_dst, op[0], op[1]));
1434       break;
1435    case ir_binop_bit_or:
1436       inst = emit(OR(result_dst, op[0], op[1]));
1437       break;
1438
1439    case ir_binop_lshift:
1440       inst = emit(SHL(result_dst, op[0], op[1]));
1441       break;
1442
1443    case ir_binop_rshift:
1444       if (ir->type->base_type == GLSL_TYPE_INT)
1445          inst = emit(ASR(result_dst, op[0], op[1]));
1446       else
1447          inst = emit(SHR(result_dst, op[0], op[1]));
1448       break;
1449
1450    case ir_binop_ubo_load: {
1451       ir_constant *uniform_block = ir->operands[0]->as_constant();
1452       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1453       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1454       src_reg offset = op[1];
1455
1456       /* Now, load the vector from that offset. */
1457       assert(ir->type->is_vector() || ir->type->is_scalar());
1458
1459       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1460       packed_consts.type = result.type;
1461       src_reg surf_index =
1462          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1463       if (const_offset_ir) {
1464          offset = src_reg(const_offset / 16);
1465       } else {
1466          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1467       }
1468
1469       vec4_instruction *pull =
1470          emit(new(mem_ctx) vec4_instruction(this,
1471                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1472                                             dst_reg(packed_consts),
1473                                             surf_index,
1474                                             offset));
1475       pull->base_mrf = 14;
1476       pull->mlen = 1;
1477
1478       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1479       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1480                                             const_offset % 16 / 4,
1481                                             const_offset % 16 / 4,
1482                                             const_offset % 16 / 4);
1483
1484       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1485       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1486          emit(CMP(result_dst, packed_consts, src_reg(0u),
1487                   BRW_CONDITIONAL_NZ));
1488          emit(AND(result_dst, result, src_reg(0x1)));
1489       } else {
1490          emit(MOV(result_dst, packed_consts));
1491       }
1492       break;
1493    }
1494
1495    case ir_quadop_vector:
1496       assert(!"not reached: should be handled by lower_quadop_vector");
1497       break;
1498    }
1499 }
1500
1501
1502 void
1503 vec4_visitor::visit(ir_swizzle *ir)
1504 {
1505    src_reg src;
1506    int i = 0;
1507    int swizzle[4];
1508
1509    /* Note that this is only swizzles in expressions, not those on the left
1510     * hand side of an assignment, which do write masking.  See ir_assignment
1511     * for that.
1512     */
1513
1514    ir->val->accept(this);
1515    src = this->result;
1516    assert(src.file != BAD_FILE);
1517
1518    for (i = 0; i < ir->type->vector_elements; i++) {
1519       switch (i) {
1520       case 0:
1521          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1522          break;
1523       case 1:
1524          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1525          break;
1526       case 2:
1527          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1528          break;
1529       case 3:
1530          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1531             break;
1532       }
1533    }
1534    for (; i < 4; i++) {
1535       /* Replicate the last channel out. */
1536       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1537    }
1538
1539    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1540
1541    this->result = src;
1542 }
1543
1544 void
1545 vec4_visitor::visit(ir_dereference_variable *ir)
1546 {
1547    const struct glsl_type *type = ir->type;
1548    dst_reg *reg = variable_storage(ir->var);
1549
1550    if (!reg) {
1551       fail("Failed to find variable storage for %s\n", ir->var->name);
1552       this->result = src_reg(brw_null_reg());
1553       return;
1554    }
1555
1556    this->result = src_reg(*reg);
1557
1558    /* System values get their swizzle from the dst_reg writemask */
1559    if (ir->var->mode == ir_var_system_value)
1560       return;
1561
1562    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1563       this->result.swizzle = swizzle_for_size(type->vector_elements);
1564 }
1565
1566 void
1567 vec4_visitor::visit(ir_dereference_array *ir)
1568 {
1569    ir_constant *constant_index;
1570    src_reg src;
1571    int element_size = type_size(ir->type);
1572
1573    constant_index = ir->array_index->constant_expression_value();
1574
1575    ir->array->accept(this);
1576    src = this->result;
1577
1578    if (constant_index) {
1579       src.reg_offset += constant_index->value.i[0] * element_size;
1580    } else {
1581       /* Variable index array dereference.  It eats the "vec4" of the
1582        * base of the array and an index that offsets the Mesa register
1583        * index.
1584        */
1585       ir->array_index->accept(this);
1586
1587       src_reg index_reg;
1588
1589       if (element_size == 1) {
1590          index_reg = this->result;
1591       } else {
1592          index_reg = src_reg(this, glsl_type::int_type);
1593
1594          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1595       }
1596
1597       if (src.reladdr) {
1598          src_reg temp = src_reg(this, glsl_type::int_type);
1599
1600          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1601
1602          index_reg = temp;
1603       }
1604
1605       src.reladdr = ralloc(mem_ctx, src_reg);
1606       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1607    }
1608
1609    /* If the type is smaller than a vec4, replicate the last channel out. */
1610    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1611       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1612    else
1613       src.swizzle = BRW_SWIZZLE_NOOP;
1614    src.type = brw_type_for_base_type(ir->type);
1615
1616    this->result = src;
1617 }
1618
1619 void
1620 vec4_visitor::visit(ir_dereference_record *ir)
1621 {
1622    unsigned int i;
1623    const glsl_type *struct_type = ir->record->type;
1624    int offset = 0;
1625
1626    ir->record->accept(this);
1627
1628    for (i = 0; i < struct_type->length; i++) {
1629       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1630          break;
1631       offset += type_size(struct_type->fields.structure[i].type);
1632    }
1633
1634    /* If the type is smaller than a vec4, replicate the last channel out. */
1635    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1636       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1637    else
1638       this->result.swizzle = BRW_SWIZZLE_NOOP;
1639    this->result.type = brw_type_for_base_type(ir->type);
1640
1641    this->result.reg_offset += offset;
1642 }
1643
1644 /**
1645  * We want to be careful in assignment setup to hit the actual storage
1646  * instead of potentially using a temporary like we might with the
1647  * ir_dereference handler.
1648  */
1649 static dst_reg
1650 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1651 {
1652    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1653     * access of a vector, it must be separated into a series conditional moves
1654     * before reaching this point (see ir_vec_index_to_cond_assign).
1655     */
1656    assert(ir->as_dereference());
1657    ir_dereference_array *deref_array = ir->as_dereference_array();
1658    if (deref_array) {
1659       assert(!deref_array->array->type->is_vector());
1660    }
1661
1662    /* Use the rvalue deref handler for the most part.  We'll ignore
1663     * swizzles in it and write swizzles using writemask, though.
1664     */
1665    ir->accept(v);
1666    return dst_reg(v->result);
1667 }
1668
1669 void
1670 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1671                               const struct glsl_type *type, uint32_t predicate)
1672 {
1673    if (type->base_type == GLSL_TYPE_STRUCT) {
1674       for (unsigned int i = 0; i < type->length; i++) {
1675          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1676       }
1677       return;
1678    }
1679
1680    if (type->is_array()) {
1681       for (unsigned int i = 0; i < type->length; i++) {
1682          emit_block_move(dst, src, type->fields.array, predicate);
1683       }
1684       return;
1685    }
1686
1687    if (type->is_matrix()) {
1688       const struct glsl_type *vec_type;
1689
1690       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1691                                          type->vector_elements, 1);
1692
1693       for (int i = 0; i < type->matrix_columns; i++) {
1694          emit_block_move(dst, src, vec_type, predicate);
1695       }
1696       return;
1697    }
1698
1699    assert(type->is_scalar() || type->is_vector());
1700
1701    dst->type = brw_type_for_base_type(type);
1702    src->type = dst->type;
1703
1704    dst->writemask = (1 << type->vector_elements) - 1;
1705
1706    src->swizzle = swizzle_for_size(type->vector_elements);
1707
1708    vec4_instruction *inst = emit(MOV(*dst, *src));
1709    inst->predicate = predicate;
1710
1711    dst->reg_offset++;
1712    src->reg_offset++;
1713 }
1714
1715
1716 /* If the RHS processing resulted in an instruction generating a
1717  * temporary value, and it would be easy to rewrite the instruction to
1718  * generate its result right into the LHS instead, do so.  This ends
1719  * up reliably removing instructions where it can be tricky to do so
1720  * later without real UD chain information.
1721  */
1722 bool
1723 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1724                                      dst_reg dst,
1725                                      src_reg src,
1726                                      vec4_instruction *pre_rhs_inst,
1727                                      vec4_instruction *last_rhs_inst)
1728 {
1729    /* This could be supported, but it would take more smarts. */
1730    if (ir->condition)
1731       return false;
1732
1733    if (pre_rhs_inst == last_rhs_inst)
1734       return false; /* No instructions generated to work with. */
1735
1736    /* Make sure the last instruction generated our source reg. */
1737    if (src.file != GRF ||
1738        src.file != last_rhs_inst->dst.file ||
1739        src.reg != last_rhs_inst->dst.reg ||
1740        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1741        src.reladdr ||
1742        src.abs ||
1743        src.negate ||
1744        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1745       return false;
1746
1747    /* Check that that last instruction fully initialized the channels
1748     * we want to use, in the order we want to use them.  We could
1749     * potentially reswizzle the operands of many instructions so that
1750     * we could handle out of order channels, but don't yet.
1751     */
1752
1753    for (unsigned i = 0; i < 4; i++) {
1754       if (dst.writemask & (1 << i)) {
1755          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1756             return false;
1757
1758          if (BRW_GET_SWZ(src.swizzle, i) != i)
1759             return false;
1760       }
1761    }
1762
1763    /* Success!  Rewrite the instruction. */
1764    last_rhs_inst->dst.file = dst.file;
1765    last_rhs_inst->dst.reg = dst.reg;
1766    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1767    last_rhs_inst->dst.reladdr = dst.reladdr;
1768    last_rhs_inst->dst.writemask &= dst.writemask;
1769
1770    return true;
1771 }
1772
1773 void
1774 vec4_visitor::visit(ir_assignment *ir)
1775 {
1776    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1777    uint32_t predicate = BRW_PREDICATE_NONE;
1778
1779    if (!ir->lhs->type->is_scalar() &&
1780        !ir->lhs->type->is_vector()) {
1781       ir->rhs->accept(this);
1782       src_reg src = this->result;
1783
1784       if (ir->condition) {
1785          emit_bool_to_cond_code(ir->condition, &predicate);
1786       }
1787
1788       /* emit_block_move doesn't account for swizzles in the source register.
1789        * This should be ok, since the source register is a structure or an
1790        * array, and those can't be swizzled.  But double-check to be sure.
1791        */
1792       assert(src.swizzle ==
1793              (ir->rhs->type->is_matrix()
1794               ? swizzle_for_size(ir->rhs->type->vector_elements)
1795               : BRW_SWIZZLE_NOOP));
1796
1797       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1798       return;
1799    }
1800
1801    /* Now we're down to just a scalar/vector with writemasks. */
1802    int i;
1803
1804    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1805    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1806
1807    ir->rhs->accept(this);
1808
1809    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1810
1811    src_reg src = this->result;
1812
1813    int swizzles[4];
1814    int first_enabled_chan = 0;
1815    int src_chan = 0;
1816
1817    assert(ir->lhs->type->is_vector() ||
1818           ir->lhs->type->is_scalar());
1819    dst.writemask = ir->write_mask;
1820
1821    for (int i = 0; i < 4; i++) {
1822       if (dst.writemask & (1 << i)) {
1823          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1824          break;
1825       }
1826    }
1827
1828    /* Swizzle a small RHS vector into the channels being written.
1829     *
1830     * glsl ir treats write_mask as dictating how many channels are
1831     * present on the RHS while in our instructions we need to make
1832     * those channels appear in the slots of the vec4 they're written to.
1833     */
1834    for (int i = 0; i < 4; i++) {
1835       if (dst.writemask & (1 << i))
1836          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1837       else
1838          swizzles[i] = first_enabled_chan;
1839    }
1840    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1841                               swizzles[2], swizzles[3]);
1842
1843    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1844       return;
1845    }
1846
1847    if (ir->condition) {
1848       emit_bool_to_cond_code(ir->condition, &predicate);
1849    }
1850
1851    for (i = 0; i < type_size(ir->lhs->type); i++) {
1852       vec4_instruction *inst = emit(MOV(dst, src));
1853       inst->predicate = predicate;
1854
1855       dst.reg_offset++;
1856       src.reg_offset++;
1857    }
1858 }
1859
1860 void
1861 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1862 {
1863    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1864       foreach_list(node, &ir->components) {
1865          ir_constant *field_value = (ir_constant *)node;
1866
1867          emit_constant_values(dst, field_value);
1868       }
1869       return;
1870    }
1871
1872    if (ir->type->is_array()) {
1873       for (unsigned int i = 0; i < ir->type->length; i++) {
1874          emit_constant_values(dst, ir->array_elements[i]);
1875       }
1876       return;
1877    }
1878
1879    if (ir->type->is_matrix()) {
1880       for (int i = 0; i < ir->type->matrix_columns; i++) {
1881          float *vec = &ir->value.f[i * ir->type->vector_elements];
1882
1883          for (int j = 0; j < ir->type->vector_elements; j++) {
1884             dst->writemask = 1 << j;
1885             dst->type = BRW_REGISTER_TYPE_F;
1886
1887             emit(MOV(*dst, src_reg(vec[j])));
1888          }
1889          dst->reg_offset++;
1890       }
1891       return;
1892    }
1893
1894    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1895
1896    for (int i = 0; i < ir->type->vector_elements; i++) {
1897       if (!(remaining_writemask & (1 << i)))
1898          continue;
1899
1900       dst->writemask = 1 << i;
1901       dst->type = brw_type_for_base_type(ir->type);
1902
1903       /* Find other components that match the one we're about to
1904        * write.  Emits fewer instructions for things like vec4(0.5,
1905        * 1.5, 1.5, 1.5).
1906        */
1907       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1908          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1909             if (ir->value.b[i] == ir->value.b[j])
1910                dst->writemask |= (1 << j);
1911          } else {
1912             /* u, i, and f storage all line up, so no need for a
1913              * switch case for comparing each type.
1914              */
1915             if (ir->value.u[i] == ir->value.u[j])
1916                dst->writemask |= (1 << j);
1917          }
1918       }
1919
1920       switch (ir->type->base_type) {
1921       case GLSL_TYPE_FLOAT:
1922          emit(MOV(*dst, src_reg(ir->value.f[i])));
1923          break;
1924       case GLSL_TYPE_INT:
1925          emit(MOV(*dst, src_reg(ir->value.i[i])));
1926          break;
1927       case GLSL_TYPE_UINT:
1928          emit(MOV(*dst, src_reg(ir->value.u[i])));
1929          break;
1930       case GLSL_TYPE_BOOL:
1931          emit(MOV(*dst, src_reg(ir->value.b[i])));
1932          break;
1933       default:
1934          assert(!"Non-float/uint/int/bool constant");
1935          break;
1936       }
1937
1938       remaining_writemask &= ~dst->writemask;
1939    }
1940    dst->reg_offset++;
1941 }
1942
1943 void
1944 vec4_visitor::visit(ir_constant *ir)
1945 {
1946    dst_reg dst = dst_reg(this, ir->type);
1947    this->result = src_reg(dst);
1948
1949    emit_constant_values(&dst, ir);
1950 }
1951
1952 void
1953 vec4_visitor::visit(ir_call *ir)
1954 {
1955    assert(!"not reached");
1956 }
1957
1958 void
1959 vec4_visitor::visit(ir_texture *ir)
1960 {
1961    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1962
1963    /* Should be lowered by do_lower_texture_projection */
1964    assert(!ir->projector);
1965
1966    /* Generate code to compute all the subexpression trees.  This has to be
1967     * done before loading any values into MRFs for the sampler message since
1968     * generating these values may involve SEND messages that need the MRFs.
1969     */
1970    src_reg coordinate;
1971    if (ir->coordinate) {
1972       ir->coordinate->accept(this);
1973       coordinate = this->result;
1974    }
1975
1976    src_reg shadow_comparitor;
1977    if (ir->shadow_comparitor) {
1978       ir->shadow_comparitor->accept(this);
1979       shadow_comparitor = this->result;
1980    }
1981
1982    src_reg lod, dPdx, dPdy;
1983    switch (ir->op) {
1984    case ir_txf:
1985    case ir_txl:
1986    case ir_txs:
1987       ir->lod_info.lod->accept(this);
1988       lod = this->result;
1989       break;
1990    case ir_txd:
1991       ir->lod_info.grad.dPdx->accept(this);
1992       dPdx = this->result;
1993
1994       ir->lod_info.grad.dPdy->accept(this);
1995       dPdy = this->result;
1996       break;
1997    case ir_tex:
1998    case ir_txb:
1999       break;
2000    }
2001
2002    vec4_instruction *inst = NULL;
2003    switch (ir->op) {
2004    case ir_tex:
2005    case ir_txl:
2006       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2007       break;
2008    case ir_txd:
2009       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2010       break;
2011    case ir_txf:
2012       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2013       break;
2014    case ir_txs:
2015       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2016       break;
2017    case ir_txb:
2018       assert(!"TXB is not valid for vertex shaders.");
2019    }
2020
2021    /* Texel offsets go in the message header; Gen4 also requires headers. */
2022    inst->header_present = ir->offset || intel->gen < 5;
2023    inst->base_mrf = 2;
2024    inst->mlen = inst->header_present + 1; /* always at least one */
2025    inst->sampler = sampler;
2026    inst->dst = dst_reg(this, ir->type);
2027    inst->shadow_compare = ir->shadow_comparitor != NULL;
2028
2029    if (ir->offset != NULL && ir->op != ir_txf)
2030       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2031
2032    /* MRF for the first parameter */
2033    int param_base = inst->base_mrf + inst->header_present;
2034
2035    if (ir->op == ir_txs) {
2036       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2037       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
2038            lod));
2039    } else {
2040       int i, coord_mask = 0, zero_mask = 0;
2041       /* Load the coordinate */
2042       /* FINISHME: gl_clamp_mask and saturate */
2043       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2044          coord_mask |= (1 << i);
2045       for (; i < 4; i++)
2046          zero_mask |= (1 << i);
2047
2048       if (ir->offset && ir->op == ir_txf) {
2049          /* It appears that the ld instruction used for txf does its
2050           * address bounds check before adding in the offset.  To work
2051           * around this, just add the integer offset to the integer
2052           * texel coordinate, and don't put the offset in the header.
2053           */
2054          ir_constant *offset = ir->offset->as_constant();
2055          assert(offset);
2056
2057          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2058             src_reg src = coordinate;
2059             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2060                                        BRW_GET_SWZ(src.swizzle, j),
2061                                        BRW_GET_SWZ(src.swizzle, j),
2062                                        BRW_GET_SWZ(src.swizzle, j));
2063             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2064                      src, offset->value.i[j]));
2065          }
2066       } else {
2067          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2068                   coordinate));
2069       }
2070       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2071                src_reg(0)));
2072       /* Load the shadow comparitor */
2073       if (ir->shadow_comparitor) {
2074          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2075                           WRITEMASK_X),
2076                   shadow_comparitor));
2077          inst->mlen++;
2078       }
2079
2080       /* Load the LOD info */
2081       if (ir->op == ir_txl) {
2082          int mrf, writemask;
2083          if (intel->gen >= 5) {
2084             mrf = param_base + 1;
2085             if (ir->shadow_comparitor) {
2086                writemask = WRITEMASK_Y;
2087                /* mlen already incremented */
2088             } else {
2089                writemask = WRITEMASK_X;
2090                inst->mlen++;
2091             }
2092          } else /* intel->gen == 4 */ {
2093             mrf = param_base;
2094             writemask = WRITEMASK_Z;
2095          }
2096          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
2097       } else if (ir->op == ir_txf) {
2098          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
2099                   lod));
2100       } else if (ir->op == ir_txd) {
2101          const glsl_type *type = ir->lod_info.grad.dPdx->type;
2102
2103          if (intel->gen >= 5) {
2104             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2105             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2106             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2107             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2108             inst->mlen++;
2109
2110             if (ir->type->vector_elements == 3) {
2111                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2112                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2113                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2114                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2115                inst->mlen++;
2116             }
2117          } else /* intel->gen == 4 */ {
2118             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2119             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2120             inst->mlen += 2;
2121          }
2122       }
2123    }
2124
2125    emit(inst);
2126
2127    swizzle_result(ir, src_reg(inst->dst), sampler);
2128 }
2129
2130 void
2131 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2132 {
2133    this->result = orig_val;
2134
2135    int s = c->key.tex.swizzles[sampler];
2136
2137    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2138                         || s == SWIZZLE_NOOP)
2139       return;
2140
2141    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2142    int swizzle[4];
2143
2144    for (int i = 0; i < 4; i++) {
2145       switch (GET_SWZ(s, i)) {
2146       case SWIZZLE_ZERO:
2147          zero_mask |= (1 << i);
2148          break;
2149       case SWIZZLE_ONE:
2150          one_mask |= (1 << i);
2151          break;
2152       default:
2153          copy_mask |= (1 << i);
2154          swizzle[i] = GET_SWZ(s, i);
2155          break;
2156       }
2157    }
2158
2159    this->result = src_reg(this, ir->type);
2160    dst_reg swizzled_result(this->result);
2161
2162    if (copy_mask) {
2163       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2164       swizzled_result.writemask = copy_mask;
2165       emit(MOV(swizzled_result, orig_val));
2166    }
2167
2168    if (zero_mask) {
2169       swizzled_result.writemask = zero_mask;
2170       emit(MOV(swizzled_result, src_reg(0.0f)));
2171    }
2172
2173    if (one_mask) {
2174       swizzled_result.writemask = one_mask;
2175       emit(MOV(swizzled_result, src_reg(1.0f)));
2176    }
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_return *ir)
2181 {
2182    assert(!"not reached");
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_discard *ir)
2187 {
2188    assert(!"not reached");
2189 }
2190
2191 void
2192 vec4_visitor::visit(ir_if *ir)
2193 {
2194    /* Don't point the annotation at the if statement, because then it plus
2195     * the then and else blocks get printed.
2196     */
2197    this->base_ir = ir->condition;
2198
2199    if (intel->gen == 6) {
2200       emit_if_gen6(ir);
2201    } else {
2202       uint32_t predicate;
2203       emit_bool_to_cond_code(ir->condition, &predicate);
2204       emit(IF(predicate));
2205    }
2206
2207    visit_instructions(&ir->then_instructions);
2208
2209    if (!ir->else_instructions.is_empty()) {
2210       this->base_ir = ir->condition;
2211       emit(BRW_OPCODE_ELSE);
2212
2213       visit_instructions(&ir->else_instructions);
2214    }
2215
2216    this->base_ir = ir->condition;
2217    emit(BRW_OPCODE_ENDIF);
2218 }
2219
2220 void
2221 vec4_visitor::emit_ndc_computation()
2222 {
2223    /* Get the position */
2224    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2225
2226    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2227    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2228    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2229
2230    current_annotation = "NDC";
2231    dst_reg ndc_w = ndc;
2232    ndc_w.writemask = WRITEMASK_W;
2233    src_reg pos_w = pos;
2234    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2235    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2236
2237    dst_reg ndc_xyz = ndc;
2238    ndc_xyz.writemask = WRITEMASK_XYZ;
2239
2240    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2241 }
2242
2243 void
2244 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2245 {
2246    if (intel->gen < 6 &&
2247        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2248         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2249       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2250       dst_reg header1_w = header1;
2251       header1_w.writemask = WRITEMASK_W;
2252       GLuint i;
2253
2254       emit(MOV(header1, 0u));
2255
2256       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2257          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2258
2259          current_annotation = "Point size";
2260          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2261          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2262       }
2263
2264       current_annotation = "Clipping flags";
2265       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2266          vec4_instruction *inst;
2267
2268          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2269                          src_reg(this->userplane[i])));
2270          inst->conditional_mod = BRW_CONDITIONAL_L;
2271
2272          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2273          inst->predicate = BRW_PREDICATE_NORMAL;
2274       }
2275
2276       /* i965 clipping workaround:
2277        * 1) Test for -ve rhw
2278        * 2) If set,
2279        *      set ndc = (0,0,0,0)
2280        *      set ucp[6] = 1
2281        *
2282        * Later, clipping will detect ucp[6] and ensure the primitive is
2283        * clipped against all fixed planes.
2284        */
2285       if (brw->has_negative_rhw_bug) {
2286 #if 0
2287          /* FINISHME */
2288          brw_CMP(p,
2289                  vec8(brw_null_reg()),
2290                  BRW_CONDITIONAL_L,
2291                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2292                  brw_imm_f(0));
2293
2294          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2295          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2296          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2297 #endif
2298       }
2299
2300       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2301    } else if (intel->gen < 6) {
2302       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2303    } else {
2304       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2305       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2306          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2307                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2308       }
2309    }
2310 }
2311
2312 void
2313 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2314 {
2315    if (intel->gen < 6) {
2316       /* Clip distance slots are set aside in gen5, but they are not used.  It
2317        * is not clear whether we actually need to set aside space for them,
2318        * but the performance cost is negligible.
2319        */
2320       return;
2321    }
2322
2323    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2324     *
2325     *     "If a linked set of shaders forming the vertex stage contains no
2326     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2327     *     application has requested clipping against user clip planes through
2328     *     the API, then the coordinate written to gl_Position is used for
2329     *     comparison against the user clip planes."
2330     *
2331     * This function is only called if the shader didn't write to
2332     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2333     * if the user wrote to it; otherwise we use gl_Position.
2334     */
2335    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2336    if (!(c->prog_data.outputs_written
2337          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2338       clip_vertex = VERT_RESULT_HPOS;
2339    }
2340
2341    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2342         ++i) {
2343       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2344                src_reg(output_reg[clip_vertex]),
2345                src_reg(this->userplane[i + offset])));
2346    }
2347 }
2348
2349 void
2350 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2351 {
2352    assert (vert_result < VERT_RESULT_MAX);
2353    reg.type = output_reg[vert_result].type;
2354    current_annotation = output_reg_annotation[vert_result];
2355    /* Copy the register, saturating if necessary */
2356    vec4_instruction *inst = emit(MOV(reg,
2357                                      src_reg(output_reg[vert_result])));
2358    if ((vert_result == VERT_RESULT_COL0 ||
2359         vert_result == VERT_RESULT_COL1 ||
2360         vert_result == VERT_RESULT_BFC0 ||
2361         vert_result == VERT_RESULT_BFC1) &&
2362        c->key.clamp_vertex_color) {
2363       inst->saturate = true;
2364    }
2365 }
2366
2367 void
2368 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2369 {
2370    struct brw_reg hw_reg = brw_message_reg(mrf);
2371    dst_reg reg = dst_reg(MRF, mrf);
2372    reg.type = BRW_REGISTER_TYPE_F;
2373
2374    switch (vert_result) {
2375    case VERT_RESULT_PSIZ:
2376       /* PSIZ is always in slot 0, and is coupled with other flags. */
2377       current_annotation = "indices, point width, clip flags";
2378       emit_psiz_and_flags(hw_reg);
2379       break;
2380    case BRW_VERT_RESULT_NDC:
2381       current_annotation = "NDC";
2382       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2383       break;
2384    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2385    case VERT_RESULT_HPOS:
2386       current_annotation = "gl_Position";
2387       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2388       break;
2389    case VERT_RESULT_CLIP_DIST0:
2390    case VERT_RESULT_CLIP_DIST1:
2391       if (this->c->key.uses_clip_distance) {
2392          emit_generic_urb_slot(reg, vert_result);
2393       } else {
2394          current_annotation = "user clip distances";
2395          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2396       }
2397       break;
2398    case VERT_RESULT_EDGE:
2399       /* This is present when doing unfilled polygons.  We're supposed to copy
2400        * the edge flag from the user-provided vertex array
2401        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2402        * of that attribute (starts as 1.0f).  This is then used in clipping to
2403        * determine which edges should be drawn as wireframe.
2404        */
2405       current_annotation = "edge flag";
2406       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2407                                     glsl_type::float_type, WRITEMASK_XYZW))));
2408       break;
2409    case BRW_VERT_RESULT_PAD:
2410       /* No need to write to this slot */
2411       break;
2412    default:
2413       emit_generic_urb_slot(reg, vert_result);
2414       break;
2415    }
2416 }
2417
2418 static int
2419 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2420 {
2421    struct intel_context *intel = &brw->intel;
2422
2423    if (intel->gen >= 6) {
2424       /* URB data written (does not include the message header reg) must
2425        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2426        * section 5.4.3.2.2: URB_INTERLEAVED.
2427        *
2428        * URB entries are allocated on a multiple of 1024 bits, so an
2429        * extra 128 bits written here to make the end align to 256 is
2430        * no problem.
2431        */
2432       if ((mlen % 2) != 1)
2433          mlen++;
2434    }
2435
2436    return mlen;
2437 }
2438
2439 /**
2440  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2441  * complete the VS thread.
2442  *
2443  * The VUE layout is documented in Volume 2a.
2444  */
2445 void
2446 vec4_visitor::emit_urb_writes()
2447 {
2448    /* MRF 0 is reserved for the debugger, so start with message header
2449     * in MRF 1.
2450     */
2451    int base_mrf = 1;
2452    int mrf = base_mrf;
2453    /* In the process of generating our URB write message contents, we
2454     * may need to unspill a register or load from an array.  Those
2455     * reads would use MRFs 14-15.
2456     */
2457    int max_usable_mrf = 13;
2458
2459    /* The following assertion verifies that max_usable_mrf causes an
2460     * even-numbered amount of URB write data, which will meet gen6's
2461     * requirements for length alignment.
2462     */
2463    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2464
2465    /* First mrf is the g0-based message header containing URB handles and such,
2466     * which is implied in VS_OPCODE_URB_WRITE.
2467     */
2468    mrf++;
2469
2470    if (intel->gen < 6) {
2471       emit_ndc_computation();
2472    }
2473
2474    /* Set up the VUE data for the first URB write */
2475    int slot;
2476    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2477       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2478
2479       /* If this was max_usable_mrf, we can't fit anything more into this URB
2480        * WRITE.
2481        */
2482       if (mrf > max_usable_mrf) {
2483          slot++;
2484          break;
2485       }
2486    }
2487
2488    current_annotation = "URB write";
2489    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2490    inst->base_mrf = base_mrf;
2491    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2492    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2493
2494    /* Optional second URB write */
2495    if (!inst->eot) {
2496       mrf = base_mrf + 1;
2497
2498       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2499          assert(mrf < max_usable_mrf);
2500
2501          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2502       }
2503
2504       current_annotation = "URB write";
2505       inst = emit(VS_OPCODE_URB_WRITE);
2506       inst->base_mrf = base_mrf;
2507       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2508       inst->eot = true;
2509       /* URB destination offset.  In the previous write, we got MRFs
2510        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2511        * URB row increments, and each of our MRFs is half of one of
2512        * those, since we're doing interleaved writes.
2513        */
2514       inst->offset = (max_usable_mrf - base_mrf) / 2;
2515    }
2516 }
2517
2518 src_reg
2519 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2520                                  src_reg *reladdr, int reg_offset)
2521 {
2522    /* Because we store the values to scratch interleaved like our
2523     * vertex data, we need to scale the vec4 index by 2.
2524     */
2525    int message_header_scale = 2;
2526
2527    /* Pre-gen6, the message header uses byte offsets instead of vec4
2528     * (16-byte) offset units.
2529     */
2530    if (intel->gen < 6)
2531       message_header_scale *= 16;
2532
2533    if (reladdr) {
2534       src_reg index = src_reg(this, glsl_type::int_type);
2535
2536       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2537       emit_before(inst, MUL(dst_reg(index),
2538                             index, src_reg(message_header_scale)));
2539
2540       return index;
2541    } else {
2542       return src_reg(reg_offset * message_header_scale);
2543    }
2544 }
2545
2546 src_reg
2547 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2548                                        src_reg *reladdr, int reg_offset)
2549 {
2550    if (reladdr) {
2551       src_reg index = src_reg(this, glsl_type::int_type);
2552
2553       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2554
2555       /* Pre-gen6, the message header uses byte offsets instead of vec4
2556        * (16-byte) offset units.
2557        */
2558       if (intel->gen < 6) {
2559          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2560       }
2561
2562       return index;
2563    } else {
2564       int message_header_scale = intel->gen < 6 ? 16 : 1;
2565       return src_reg(reg_offset * message_header_scale);
2566    }
2567 }
2568
2569 /**
2570  * Emits an instruction before @inst to load the value named by @orig_src
2571  * from scratch space at @base_offset to @temp.
2572  *
2573  * @base_offset is measured in 32-byte units (the size of a register).
2574  */
2575 void
2576 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2577                                 dst_reg temp, src_reg orig_src,
2578                                 int base_offset)
2579 {
2580    int reg_offset = base_offset + orig_src.reg_offset;
2581    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2582
2583    emit_before(inst, SCRATCH_READ(temp, index));
2584 }
2585
2586 /**
2587  * Emits an instruction after @inst to store the value to be written
2588  * to @orig_dst to scratch space at @base_offset, from @temp.
2589  *
2590  * @base_offset is measured in 32-byte units (the size of a register).
2591  */
2592 void
2593 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2594 {
2595    int reg_offset = base_offset + inst->dst.reg_offset;
2596    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2597
2598    /* Create a temporary register to store *inst's result in.
2599     *
2600     * We have to be careful in MOVing from our temporary result register in
2601     * the scratch write.  If we swizzle from channels of the temporary that
2602     * weren't initialized, it will confuse live interval analysis, which will
2603     * make spilling fail to make progress.
2604     */
2605    src_reg temp = src_reg(this, glsl_type::vec4_type);
2606    temp.type = inst->dst.type;
2607    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2608    int swizzles[4];
2609    for (int i = 0; i < 4; i++)
2610       if (inst->dst.writemask & (1 << i))
2611          swizzles[i] = i;
2612       else
2613          swizzles[i] = first_writemask_chan;
2614    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2615                                swizzles[2], swizzles[3]);
2616
2617    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2618                                        inst->dst.writemask));
2619    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2620    write->predicate = inst->predicate;
2621    write->ir = inst->ir;
2622    write->annotation = inst->annotation;
2623    inst->insert_after(write);
2624
2625    inst->dst.file = temp.file;
2626    inst->dst.reg = temp.reg;
2627    inst->dst.reg_offset = temp.reg_offset;
2628    inst->dst.reladdr = NULL;
2629 }
2630
2631 /**
2632  * We can't generally support array access in GRF space, because a
2633  * single instruction's destination can only span 2 contiguous
2634  * registers.  So, we send all GRF arrays that get variable index
2635  * access to scratch space.
2636  */
2637 void
2638 vec4_visitor::move_grf_array_access_to_scratch()
2639 {
2640    int scratch_loc[this->virtual_grf_count];
2641
2642    for (int i = 0; i < this->virtual_grf_count; i++) {
2643       scratch_loc[i] = -1;
2644    }
2645
2646    /* First, calculate the set of virtual GRFs that need to be punted
2647     * to scratch due to having any array access on them, and where in
2648     * scratch.
2649     */
2650    foreach_list(node, &this->instructions) {
2651       vec4_instruction *inst = (vec4_instruction *)node;
2652
2653       if (inst->dst.file == GRF && inst->dst.reladdr &&
2654           scratch_loc[inst->dst.reg] == -1) {
2655          scratch_loc[inst->dst.reg] = c->last_scratch;
2656          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2657       }
2658
2659       for (int i = 0 ; i < 3; i++) {
2660          src_reg *src = &inst->src[i];
2661
2662          if (src->file == GRF && src->reladdr &&
2663              scratch_loc[src->reg] == -1) {
2664             scratch_loc[src->reg] = c->last_scratch;
2665             c->last_scratch += this->virtual_grf_sizes[src->reg];
2666          }
2667       }
2668    }
2669
2670    /* Now, for anything that will be accessed through scratch, rewrite
2671     * it to load/store.  Note that this is a _safe list walk, because
2672     * we may generate a new scratch_write instruction after the one
2673     * we're processing.
2674     */
2675    foreach_list_safe(node, &this->instructions) {
2676       vec4_instruction *inst = (vec4_instruction *)node;
2677
2678       /* Set up the annotation tracking for new generated instructions. */
2679       base_ir = inst->ir;
2680       current_annotation = inst->annotation;
2681
2682       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2683          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2684       }
2685
2686       for (int i = 0 ; i < 3; i++) {
2687          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2688             continue;
2689
2690          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2691
2692          emit_scratch_read(inst, temp, inst->src[i],
2693                            scratch_loc[inst->src[i].reg]);
2694
2695          inst->src[i].file = temp.file;
2696          inst->src[i].reg = temp.reg;
2697          inst->src[i].reg_offset = temp.reg_offset;
2698          inst->src[i].reladdr = NULL;
2699       }
2700    }
2701 }
2702
2703 /**
2704  * Emits an instruction before @inst to load the value named by @orig_src
2705  * from the pull constant buffer (surface) at @base_offset to @temp.
2706  */
2707 void
2708 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2709                                       dst_reg temp, src_reg orig_src,
2710                                       int base_offset)
2711 {
2712    int reg_offset = base_offset + orig_src.reg_offset;
2713    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2714    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2715    vec4_instruction *load;
2716
2717    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2718                                         temp, index, offset);
2719    load->base_mrf = 14;
2720    load->mlen = 1;
2721    emit_before(inst, load);
2722 }
2723
2724 /**
2725  * Implements array access of uniforms by inserting a
2726  * PULL_CONSTANT_LOAD instruction.
2727  *
2728  * Unlike temporary GRF array access (where we don't support it due to
2729  * the difficulty of doing relative addressing on instruction
2730  * destinations), we could potentially do array access of uniforms
2731  * that were loaded in GRF space as push constants.  In real-world
2732  * usage we've seen, though, the arrays being used are always larger
2733  * than we could load as push constants, so just always move all
2734  * uniform array access out to a pull constant buffer.
2735  */
2736 void
2737 vec4_visitor::move_uniform_array_access_to_pull_constants()
2738 {
2739    int pull_constant_loc[this->uniforms];
2740
2741    for (int i = 0; i < this->uniforms; i++) {
2742       pull_constant_loc[i] = -1;
2743    }
2744
2745    /* Walk through and find array access of uniforms.  Put a copy of that
2746     * uniform in the pull constant buffer.
2747     *
2748     * Note that we don't move constant-indexed accesses to arrays.  No
2749     * testing has been done of the performance impact of this choice.
2750     */
2751    foreach_list_safe(node, &this->instructions) {
2752       vec4_instruction *inst = (vec4_instruction *)node;
2753
2754       for (int i = 0 ; i < 3; i++) {
2755          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2756             continue;
2757
2758          int uniform = inst->src[i].reg;
2759
2760          /* If this array isn't already present in the pull constant buffer,
2761           * add it.
2762           */
2763          if (pull_constant_loc[uniform] == -1) {
2764             const float **values = &prog_data->param[uniform * 4];
2765
2766             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2767
2768             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2769                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2770             }
2771          }
2772
2773          /* Set up the annotation tracking for new generated instructions. */
2774          base_ir = inst->ir;
2775          current_annotation = inst->annotation;
2776
2777          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2778
2779          emit_pull_constant_load(inst, temp, inst->src[i],
2780                                  pull_constant_loc[uniform]);
2781
2782          inst->src[i].file = temp.file;
2783          inst->src[i].reg = temp.reg;
2784          inst->src[i].reg_offset = temp.reg_offset;
2785          inst->src[i].reladdr = NULL;
2786       }
2787    }
2788
2789    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2790     * no need to track them as larger-than-vec4 objects.  This will be
2791     * relied on in cutting out unused uniform vectors from push
2792     * constants.
2793     */
2794    split_uniform_registers();
2795 }
2796
2797 void
2798 vec4_visitor::resolve_ud_negate(src_reg *reg)
2799 {
2800    if (reg->type != BRW_REGISTER_TYPE_UD ||
2801        !reg->negate)
2802       return;
2803
2804    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2805    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2806    *reg = temp;
2807 }
2808
2809 vec4_visitor::vec4_visitor(struct brw_context *brw,
2810                            struct brw_vs_compile *c,
2811                            struct gl_shader_program *prog,
2812                            struct brw_shader *shader,
2813                            void *mem_ctx)
2814 {
2815    this->c = c;
2816    this->p = &c->func;
2817    this->brw = brw;
2818    this->intel = &brw->intel;
2819    this->ctx = &intel->ctx;
2820    this->prog = prog;
2821    this->shader = shader;
2822
2823    this->mem_ctx = mem_ctx;
2824    this->failed = false;
2825
2826    this->base_ir = NULL;
2827    this->current_annotation = NULL;
2828
2829    this->c = c;
2830    this->vp = &c->vp->program;
2831    this->prog_data = &c->prog_data;
2832
2833    this->variable_ht = hash_table_ctor(0,
2834                                        hash_table_pointer_hash,
2835                                        hash_table_pointer_compare);
2836
2837    this->virtual_grf_def = NULL;
2838    this->virtual_grf_use = NULL;
2839    this->virtual_grf_sizes = NULL;
2840    this->virtual_grf_count = 0;
2841    this->virtual_grf_reg_map = NULL;
2842    this->virtual_grf_reg_count = 0;
2843    this->virtual_grf_array_size = 0;
2844    this->live_intervals_valid = false;
2845
2846    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2847
2848    this->uniforms = 0;
2849 }
2850
2851 vec4_visitor::~vec4_visitor()
2852 {
2853    hash_table_dtor(this->variable_ht);
2854 }
2855
2856
2857 void
2858 vec4_visitor::fail(const char *format, ...)
2859 {
2860    va_list va;
2861    char *msg;
2862
2863    if (failed)
2864       return;
2865
2866    failed = true;
2867
2868    va_start(va, format);
2869    msg = ralloc_vasprintf(mem_ctx, format, va);
2870    va_end(va);
2871    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2872
2873    this->fail_msg = msg;
2874
2875    if (INTEL_DEBUG & DEBUG_VS) {
2876       fprintf(stderr, "%s",  msg);
2877    }
2878 }
2879
2880 } /* namespace brw */