/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_vec4.h"
extern "C" {
#include "main/macros.h"
#include "program/prog_parameter.h"
#include "program/sampler.h"
}

namespace brw {
src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}
dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}
vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, dst_reg dst,
                                   src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->ir = v->base_ir;
   this->annotation = v->current_annotation;
}
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   this->instructions.push_tail(inst);

   return inst;
}
vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(new_inst);

   return new_inst;
}
vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
                   src_reg src0, src_reg src1, src_reg src2)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
                                             src0, src1, src2));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
}
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(dst_reg dst, src_reg src0)                          \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0);                       \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1);                 \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
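/* Usage sketch (editor's addition, not in the original source): the ALU
 * helpers above compose with the emit() overloads, so code generation in
 * the rest of this file typically reads like
 *
 *    emit(ADD(result_dst, op[0], op[1]));          // build + queue
 *    vec4_instruction *inst = emit(MOV(dst, src)); // keep handle ...
 *    inst->saturate = true;                        // ... to decorate
 */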
/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(uint32_t predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}
/** Gen6+ IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
{
   vec4_instruction *inst;

   /* original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
vec4_instruction *
vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
                                        dst, index);
   /* (base_mrf/mlen setup for the scratch-read message not shown) */

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
                                        dst, src, index);
   /* (base_mrf/mlen setup for the scratch-write message not shown) */

   return inst;
}
void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
{
   static enum opcode dot_opcodes[] = {
      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
   };

   emit(dot_opcodes[elements - 2], dst, src0, src1);
}
void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
{
   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * While it would seem that this MOV could be avoided at this point
    * in the case that the swizzle is matched up with the destination
    * writemask, note that uniform packing and register allocation
    * could rearrange our swizzle, so let's leave this matter up to
    * copy propagation later.
    */
   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
   emit(MOV(dst_reg(temp_src), src));

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, temp_src);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, temp_src);
   }
}
void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
{
   vec4_instruction *inst = emit(opcode, dst, src);
   inst->base_mrf = 1;
   inst->mlen = 1;
}
void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return;
   }

   if (intel->gen >= 6) {
      return emit_math1_gen6(opcode, dst, src);
   } else {
      return emit_math1_gen4(opcode, dst, src);
   }
}
void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   src_reg expanded;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.  Move the sources to temporaries to make it
    * generally work.
    */

   expanded = src_reg(this, glsl_type::vec4_type);
   expanded.type = src0.type;
   emit(MOV(dst_reg(expanded), src0));
   src0 = expanded;

   expanded = src_reg(this, glsl_type::vec4_type);
   expanded.type = src1.type;
   emit(MOV(dst_reg(expanded), src1));
   src1 = expanded;

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
      temp_dst.type = dst.type;

      emit(opcode, temp_dst, src0, src1);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src0, src1);
   }
}
void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(opcode, dst, src0, src1);
   inst->base_mrf = 1;
   inst->mlen = 2;
}
void
vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
{
   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode");
      return;
   }

   if (intel->gen >= 6) {
      return emit_math2_gen6(opcode, dst, src0, src1);
   } else {
      return emit_math2_gen4(opcode, dst, src0, src1);
   }
}
void
vec4_visitor::visit_instructions(const exec_list *list)
{
   foreach_list(node, list) {
      ir_instruction *ir = (ir_instruction *)node;

      base_ir = ir;
      ir->accept(this);
   }
}
static int
type_size(const struct glsl_type *type)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
   default:
      assert(0);
      return 0;
   }
}
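/* Worked examples for type_size() (editor's addition, not in the
 * original source): float, vec2 and vec4 all report 1 slot because
 * everything is vec4-aligned; mat3 reports 3 (one per column);
 * "vec3 v[4]" reports 1 * 4 = 4; and "struct { vec4 a; mat2 b; }"
 * reports 1 + 2 = 3.
 */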
int
vec4_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
                                     virtual_grf_array_size);
   }
   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
   virtual_grf_reg_count += size;
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}
dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;

   if (type->is_matrix()) {
      const glsl_type *column = type->column_type();

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[this->uniforms * 4 + i] = &values[i];
      }

      /* Set up pad elements to get things aligned to a vec4 boundary. */
      for (unsigned int i = type->vector_elements; i < 4; i++) {
         static float zero = 0;

         c->prog_data.param[this->uniforms * 4 + i] = &zero;
      }

      /* Track the size of this uniform vector, for future packing of
       * uniforms.
       */
      this->uniform_vector_size[this->uniforms] = type->vector_elements;
      this->uniforms++;

      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
void
vec4_visitor::setup_uniform_clipplane_values()
{
   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);

   /* Pre-Gen6, we compact clip planes.  For example, if the user
    * enables just clip planes 0, 1, and 3, we will enable clip planes
    * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
    * plane 2.  This simplifies the implementation of the Gen6 clip
    * plane enable flags.
    *
    * In Gen6 and later, we don't compact clip planes, because this
    * simplifies the implementation of gl_ClipDistance.
    */
   int compacted_clipplane_index = 0;
   for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
      if (intel->gen < 6 &&
          !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
         continue;
      }
      this->uniform_vector_size[this->uniforms] = 4;
      this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
      this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
      for (int j = 0; j < 4; ++j) {
         c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
      }
      ++compacted_clipplane_index;
      ++this->uniforms;
   }
}
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);
      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;

      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         last_swiz = swiz;

         c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}
dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
{
   return (dst_reg *)hash_table_find(this->variable_ht, var);
}
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         assert(!"not reached");
         break;
      }
      return;
   }

   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (intel->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
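/* Illustrative mapping (editor's addition, not in the original source):
 * a GLSL condition such as "if (all(equal(a, b)))" reaches the
 * ir_binop_all_equal case above, which emits one CMP.Z over all four
 * channels and returns BRW_PREDICATE_ALIGN16_ALL4H, so the consuming IF
 * only takes the branch when every channel compared equal.
 */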
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         break;

      case ir_binop_logic_xor:
         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         break;

      case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(OR(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         break;

      case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(AND(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         break;

      case ir_unop_f2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         break;

      case ir_unop_i2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         break;

      case ir_binop_all_equal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         break;

      case ir_binop_any_nequal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         break;

      case ir_unop_any:
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         break;

      default:
         assert(!"not reached");
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         break;
      }
      return;
   }

   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   switch (ir->mode) {
   case ir_var_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->location);

      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
       * come in as floating point conversions of the integer values.
       */
      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
         if (!c->key.gl_fixed_input_size[i])
            continue;

         dst_reg dst = *reg;
         dst.type = brw_type_for_base_type(ir->type);
         dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
         emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
      }
      break;

   case ir_var_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->location + i] = *reg;
         output_reg[ir->location + i].reg_offset = i;
         output_reg[ir->location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
         output_reg_annotation[ir->location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      this->uniform_size[this->uniforms] = type_size(ir->type);

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }
      break;

   case ir_var_system_value:
      /* VertexID is stored by the VF as the last vertex element, but
       * we don't represent it with a flag in inputs_read, so we call
       * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
       */
      reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
      prog_data->uses_vertexid = true;

      switch (ir->location) {
      case SYSTEM_VALUE_VERTEX_ID:
         reg->writemask = WRITEMASK_X;
         break;
      case SYSTEM_VALUE_INSTANCE_ID:
         reg->writemask = WRITEMASK_Y;
         break;
      default:
         assert(!"not reached");
         break;
      }
      break;

   default:
      assert(!"not reached");
   }

   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}
void
vec4_visitor::visit(ir_loop *ir)
{
   dst_reg counter;

   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   if (ir->counter != NULL) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from != NULL) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(MOV(counter, this->result));
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      emit(CMP(dst_null_d(), src_reg(counter), this->result,
               brw_conditional_for_comparison(ir->cmp)));

      vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }

   visit_instructions(&ir->body_instructions);

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(ADD(counter, src_reg(counter), this->result));
   }

   emit(BRW_OPCODE_WHILE);
}
void
vec4_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}
void
vec4_visitor::visit(ir_function_signature *ir)
{
   assert(0);
   (void)ir;
}
void
vec4_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      visit_instructions(&sig->body);
   }
}
bool
vec4_visitor::try_emit_sat(ir_expression *ir)
{
   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   src_reg src = this->result;

   this->result = src_reg(this, ir->type);
   vec4_instruction *inst;
   inst = emit(MOV(dst_reg(this->result), src));
   inst->saturate = true;

   return true;
}
void
vec4_visitor::emit_bool_comparison(unsigned int op,
                                   dst_reg dst, src_reg src0, src_reg src1)
{
   /* original gen4 does destination conversion before comparison. */
   if (intel->gen < 5)
      dst.type = src0.type;

   emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));

   dst.type = BRW_REGISTER_TYPE_D;
   emit(AND(dst, src_reg(dst), src_reg(0x1)));
}
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   if (try_emit_sat(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->print();
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(result_dst, op[0], src_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;

   case ir_unop_sign:
      emit(MOV(result_dst, src_reg(0.0f)));

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
      inst = emit(MOV(result_dst, src_reg(1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
      inst = emit(MOV(result_dst, src_reg(-1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;

      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdy:
      assert(!"derivatives not valid in vertex shader");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits
          * of one of the operands (src0 on gen6, src1 on gen7).  The
          * MACH accumulates in the contribution of the upper 16 bits
          * of that operand.
          *
          * FINISHME: Emit just the MUL if we know an operand is small
          * enough.
          */
         struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);

         emit(MUL(acc, op[0], op[1]));
         emit(MACH(dst_null_d(), op[0], op[1]));
         emit(MOV(result_dst, src_reg(acc)));
      } else {
         emit(MUL(result_dst, op[0], op[1]));
      }
      break;
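   /* Editor's note on the integer-multiply sequence above (not in the
    * original source): MUL forms the product of one full operand with
    * the other operand's low 16 bits in the accumulator, MACH then folds
    * in the contribution of that operand's high 16 bits, and the final
    * MOV copies the completed low 32 bits of the product out of the
    * accumulator into result_dst.
    */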
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      emit(CMP(result_dst, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      emit(AND(result_dst, result_src, src_reg(0x1)));
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(1)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;
   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b: {
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      emit(AND(result_dst, result_src, src_reg(1)));
      break;
   }

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));

      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   case ir_binop_max:
      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));

      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
      else
         inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;
   }
}
void
vec4_visitor::visit(ir_swizzle *ir)
{
   src_reg src;
   int i = 0;
   int swizzle[4];

   /* Note that this is only swizzles in expressions, not those on the left
    * hand side of an assignment, which do write masking.  See ir_assignment
    * for that.
    */

   ir->val->accept(this);
   src = this->result;
   assert(src.file != BAD_FILE);

   for (i = 0; i < ir->type->vector_elements; i++) {
      switch (i) {
      case 0:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
         break;
      case 1:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
         break;
      case 2:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
         break;
      case 3:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
         break;
      }
   }
   for (; i < 4; i++) {
      /* Replicate the last channel out. */
      swizzle[i] = swizzle[ir->type->vector_elements - 1];
   }

   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   this->result = src;
}
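/* Example (editor's addition, not in the original source): for a GLSL
 * expression like "v.zw" where the incoming src carries the default
 * XYZW swizzle, the loop above selects BRW_GET_SWZ(XYZW, 2) == Z and
 * BRW_GET_SWZ(XYZW, 3) == W, then replicates the last channel, yielding
 * a final swizzle of ZWWW.
 */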
void
vec4_visitor::visit(ir_dereference_variable *ir)
{
   const struct glsl_type *type = ir->type;
   dst_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = src_reg(brw_null_reg());
      return;
   }

   this->result = src_reg(*reg);

   if (type->is_scalar() || type->is_vector() || type->is_matrix())
      this->result.swizzle = swizzle_for_size(type->vector_elements);
}
void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int element_size = type_size(ir->type);

   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * element_size;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (element_size == 1) {
         index_reg = this->result;
      } else {
         index_reg = src_reg(this, glsl_type::int_type);

         emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
      }

      if (src.reladdr) {
         src_reg temp = src_reg(this, glsl_type::int_type);

         emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

         index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}
void
vec4_visitor::visit(ir_dereference_record *ir)
{
   unsigned int i;
   const glsl_type *struct_type = ir->record->type;
   int offset = 0;

   ir->record->accept(this);

   for (i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector())
      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      this->result.swizzle = BRW_SWIZZLE_NOOP;
   this->result.type = brw_type_for_base_type(ir->type);

   this->result.reg_offset += offset;
}
/**
 * We want to be careful in assignment setup to hit the actual storage
 * instead of potentially using a temporary like we might with the
 * ir_dereference handler.
 */
static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
{
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
   assert(ir->as_dereference());
   ir_dereference_array *deref_array = ir->as_dereference_array();
   if (deref_array) {
      assert(!deref_array->array->type->is_vector());
   }

   /* Use the rvalue deref handler for the most part.  We'll ignore
    * swizzles in it and write swizzles using writemask, though.
    */
   ir->accept(v);
   return dst_reg(v->result);
}
void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
                              const struct glsl_type *type, uint32_t predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                         type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
         emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   dst->writemask = (1 << type->vector_elements) - 1;

   /* Do we need to worry about swizzling a swizzle? */
   assert(src->swizzle == BRW_SWIZZLE_NOOP
          || src->swizzle == swizzle_for_size(type->vector_elements));
   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
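/* Example (editor's addition, not in the original source):
 * emit_block_move() on a mat3 recurses into three vec3 moves; each leaf
 * emits one predicated MOV with writemask XYZ and then advances
 * dst->reg_offset and src->reg_offset by one vec4 slot, so column i
 * lands in register slot i.
 */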
/**
 * If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                     dst_reg dst,
                                     src_reg src,
                                     vec4_instruction *pre_rhs_inst,
                                     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */
   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         if (!(last_rhs_inst->dst.writemask & (1 << i)))
            return false;

         if (BRW_GET_SWZ(src.swizzle, i) != i)
            return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   uint32_t predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
         emit_bool_to_cond_code(ir->condition, &predicate);
      }

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
          ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
         break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
         swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
         swizzles[i] = first_enabled_chan;
   }
   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);

   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}
*dst
, ir_constant
*ir
)
1686 if (ir
->type
->base_type
== GLSL_TYPE_STRUCT
) {
1687 foreach_list(node
, &ir
->components
) {
1688 ir_constant
*field_value
= (ir_constant
*)node
;
1690 emit_constant_values(dst
, field_value
);
1695 if (ir
->type
->is_array()) {
1696 for (unsigned int i
= 0; i
< ir
->type
->length
; i
++) {
1697 emit_constant_values(dst
, ir
->array_elements
[i
]);
1702 if (ir
->type
->is_matrix()) {
1703 for (int i
= 0; i
< ir
->type
->matrix_columns
; i
++) {
1704 for (int j
= 0; j
< ir
->type
->vector_elements
; j
++) {
1705 dst
->writemask
= 1 << j
;
1706 dst
->type
= BRW_REGISTER_TYPE_F
;
1709 src_reg(ir
->value
.f
[i
* ir
->type
->vector_elements
+ j
])));
1716 for (int i
= 0; i
< ir
->type
->vector_elements
; i
++) {
1717 dst
->writemask
= 1 << i
;
1718 dst
->type
= brw_type_for_base_type(ir
->type
);
1720 switch (ir
->type
->base_type
) {
1721 case GLSL_TYPE_FLOAT
:
1722 emit(MOV(*dst
, src_reg(ir
->value
.f
[i
])));
1725 emit(MOV(*dst
, src_reg(ir
->value
.i
[i
])));
1727 case GLSL_TYPE_UINT
:
1728 emit(MOV(*dst
, src_reg(ir
->value
.u
[i
])));
1730 case GLSL_TYPE_BOOL
:
1731 emit(MOV(*dst
, src_reg(ir
->value
.b
[i
])));
1734 assert(!"Non-float/uint/int/bool constant");
void
vec4_visitor::visit(ir_constant *ir)
{
   dst_reg dst = dst_reg(this, ir->type);
   this->result = src_reg(dst);

   emit_constant_values(&dst, ir);
}
void
vec4_visitor::visit(ir_call *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_texture *ir)
{
   int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
   sampler = vp->Base.SamplerUnits[sampler];

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   vec4_instruction *inst = NULL;
   switch (ir->op) {
   case ir_tex:
   case ir_txl:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
      break;
   case ir_txd:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
      break;
   case ir_txf:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
      break;
   case ir_txs:
      inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
      break;
   case ir_txb:
      assert(!"TXB is not valid for vertex shaders.");
      break;
   }

   inst->header_present = intel->gen < 5;
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->sampler = sampler;
   inst->dst = dst_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs) {
      ir->lod_info.lod->accept(this);
      int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
               this->result));
   } else {
      int i, coord_mask = 0, zero_mask = 0;
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      for (i = 0; i < ir->coordinate->type->vector_elements; i++)
         coord_mask |= (1 << i);
      for (; i < 4; i++)
         zero_mask |= (1 << i);

      ir->coordinate->accept(this);
      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
               this->result));
      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
               src_reg(0)));

      /* Load the shadow comparitor */
      if (ir->shadow_comparitor) {
         ir->shadow_comparitor->accept(this);
         emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
                          WRITEMASK_X),
                  this->result));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_txl) {
         int mrf, writemask;
         if (intel->gen >= 5) {
            mrf = param_base + 1;
            if (ir->shadow_comparitor) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* intel->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_Z;
         }
         ir->lod_info.lod->accept(this);
         emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
                  this->result));
      } else if (ir->op == ir_txf) {
         ir->lod_info.lod->accept(this);
         emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
                  this->result));
      } else if (ir->op == ir_txd) {
         const glsl_type *type = ir->lod_info.grad.dPdx->type;

         ir->lod_info.grad.dPdx->accept(this);
         src_reg dPdx = this->result;
         ir->lod_info.grad.dPdy->accept(this);
         src_reg dPdy = this->result;

         if (intel->gen >= 5) {
            dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
            inst->mlen++;

            if (ir->type->vector_elements == 3) {
               dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
               dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
               inst->mlen++;
            }
         } else /* intel->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
            inst->mlen += 2;
         }
      }
   }

   emit(inst);

   this->result = src_reg(inst->dst);
}
void
vec4_visitor::visit(ir_return *ir)
{
   assert(!"not reached");
}

void
vec4_visitor::visit(ir_discard *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen == 6) {
      emit_if_gen6(ir);
   } else {
      uint32_t predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}
void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VERT_RESULT_NDC] = ndc;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (intel->gen < 6 &&
       ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
        c->key.userclip_active || brw->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;
      GLuint i;

      emit(MOV(header1, 0u));

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      current_annotation = "Clipping flags";
      for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
         vec4_instruction *inst;

         inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
                         src_reg(this->userplane[i])));
         inst->conditional_mod = BRW_CONDITIONAL_L;

         inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
#if 0
         /* FINISHME */
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
#endif
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (intel->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VERT_RESULT_PSIZ])));
      }
   }
}
void
vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
{
   if (intel->gen < 6) {
      /* Clip distance slots are set aside in gen5, but they are not used.  It
       * is not clear whether we actually need to set aside space for them,
       * but the performance cost is negligible.
       */
      return;
   }

   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
   if (!(c->prog_data.outputs_written
         & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
      clip_vertex = VERT_RESULT_HPOS;
   }

   for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
        ++i) {
      emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}
void
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
{
   assert(vert_result < VERT_RESULT_MAX);
   reg.type = output_reg[vert_result].type;
   current_annotation = output_reg_annotation[vert_result];
   /* Copy the register, saturating if necessary */
   vec4_instruction *inst = emit(MOV(reg,
                                     src_reg(output_reg[vert_result])));
   if ((vert_result == VERT_RESULT_COL0 ||
        vert_result == VERT_RESULT_COL1 ||
        vert_result == VERT_RESULT_BFC0 ||
        vert_result == VERT_RESULT_BFC1) &&
       c->key.clamp_vertex_color) {
      inst->saturate = true;
   }
}
void
vec4_visitor::emit_urb_slot(int mrf, int vert_result)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (vert_result) {
   case VERT_RESULT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VERT_RESULT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
      break;
   case BRW_VERT_RESULT_HPOS_DUPLICATE:
   case VERT_RESULT_HPOS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
      break;
   case VERT_RESULT_CLIP_DIST0:
   case VERT_RESULT_CLIP_DIST1:
      if (this->c->key.uses_clip_distance) {
         emit_generic_urb_slot(reg, vert_result);
      } else {
         current_annotation = "user clip distances";
         emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
      }
      break;
   case BRW_VERT_RESULT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, vert_result);
      break;
   }
}
static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   struct intel_context *intel = &brw->intel;

   if (intel->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
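/* Example (editor's addition, not in the original source): a gen6+ URB
 * write of one header MRF plus nine data MRFs has mlen == 10; since the
 * data portion (9 registers) must be a multiple of 2, the helper bumps
 * mlen to 11, and the extra padding register is absorbed by the
 * 1024-bit URB entry allocation granularity.
 */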
/**
 * Generates the VUE payload plus the 1 or 2 URB write instructions to
 * complete the VS thread.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_urb_writes()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* FINISHME: edgeflag */

   brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
                       c->prog_data.outputs_written);

   /* First mrf is the g0-based message header containing URB handles and such,
    * which is implied in VS_OPCODE_URB_WRITE.
    */
   mrf++;

   if (intel->gen < 6) {
      emit_ndc_computation();
   }

   /* Set up the VUE data for the first URB write */
   int slot;
   for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
      emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);

      /* If this was max_usable_mrf, we can't fit anything more into this URB
       * WRITE.
       */
      if (mrf > max_usable_mrf) {
         slot++;
         break;
      }
   }

   current_annotation = "URB write";
   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
   inst->eot = (slot >= c->vue_map.num_slots);

   /* Optional second URB write */
   if (!inst->eot) {
      mrf = base_mrf + 1;

      for (; slot < c->vue_map.num_slots; ++slot) {
         assert(mrf < max_usable_mrf);

         emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
      }

      current_annotation = "URB write";
      inst = emit(VS_OPCODE_URB_WRITE);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
      inst->eot = true;
      /* URB destination offset.  In the previous write, we got MRFs
       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
       * URB row increments, and each of our MRFs is half of one of
       * those, since we're doing interleaved writes.
       */
      inst->offset = (max_usable_mrf - base_mrf) / 2;
   }

   if (intel->gen == 6)
      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
   else
      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
}
src_reg
vec4_visitor::get_scratch_offset(vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (intel->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
      emit_before(inst, MUL(dst_reg(index),
                            index, src_reg(message_header_scale)));

      return index;
   } else {
      return src_reg(reg_offset * message_header_scale);
   }
}
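/* Example (editor's addition, not in the original source): a constant
 * access at reg_offset 3 returns src_reg(3 * 2) == 6 vec4 rows on
 * gen6+ (interleaved-vertex scaling), but src_reg(3 * 32) == 96 on
 * gen4/5, where the scratch message header wants byte offsets.
 */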
src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (intel->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else {
      int message_header_scale = intel->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 */
void
vec4_visitor::emit_scratch_read(vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);

   emit_before(inst, SCRATCH_READ(temp, index));
}
/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 */
void
vec4_visitor::emit_scratch_write(vec4_instruction *inst,
                                 src_reg temp, dst_reg orig_dst,
                                 int base_offset)
{
   int reg_offset = base_offset + orig_dst.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);

   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                       orig_dst.writemask));
   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   write->predicate = inst->predicate;
   write->ir = inst->ir;
   write->annotation = inst->annotation;
   inst->insert_after(write);
}
/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->virtual_grf_count];

   for (int i = 0; i < this->virtual_grf_count; i++) {
      scratch_loc[i] = -1;
   }

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && inst->dst.reladdr &&
          scratch_loc[inst->dst.reg] == -1) {
         scratch_loc[inst->dst.reg] = c->last_scratch;
         c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
      }

      for (int i = 0 ; i < 3; i++) {
         src_reg *src = &inst->src[i];

         if (src->file == GRF && src->reladdr &&
             scratch_loc[src->reg] == -1) {
            scratch_loc[src->reg] = c->last_scratch;
            c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
         src_reg temp = src_reg(this, glsl_type::vec4_type);

         emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);

         inst->dst.file = temp.file;
         inst->dst.reg = temp.reg;
         inst->dst.reg_offset = temp.reg_offset;
         inst->dst.reladdr = NULL;
      }

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
            continue;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_scratch_read(inst, temp, inst->src[i],
                           scratch_loc[inst->src[i].reg]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
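/* Sizing note (editor's addition, not in the original source): each
 * spilled virtual GRF reserves virtual_grf_sizes[reg] * 8 * 4 bytes of
 * per-thread scratch, e.g. a 4-slot array takes 4 * 32 == 128 bytes,
 * since the two interleaved vertices of a vec4 thread occupy 8 floats
 * of 4 bytes per logical vec4 slot.
 */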
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                        temp, index);
   load->base_mrf = 14;
   load->mlen = 1;
   emit_before(inst, load);
}
/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}
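/* Example (editor's addition, not in the original source): a uniform
 * "vec4 data[16]" indexed with a variable subscript has
 * uniform_size[] == 16, so the loop above copies 16 * 4 pointers into
 * pull_param[] and replaces each reladdr UNIFORM source with a
 * temporary written by VS_OPCODE_PULL_CONSTANT_LOAD.
 */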
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}
vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
                           struct gl_shader_program *prog,
                           struct brw_shader *shader)
{
   this->c = c;
   this->p = &c->func;
   this->brw = p->brw;
   this->intel = &brw->intel;
   this->ctx = &intel->ctx;
   this->prog = prog;
   this->shader = shader;

   this->mem_ctx = ralloc_context(NULL);
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;

   this->vp = (struct gl_vertex_program *)
      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
   this->prog_data = &c->prog_data;

   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   this->virtual_grf_def = NULL;
   this->virtual_grf_use = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   this->uniforms = 0;
}
vec4_visitor::~vec4_visitor()
{
   ralloc_free(this->mem_ctx);
   hash_table_dtor(this->variable_ht);
}
void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_VS) {
      fprintf(stderr, "%s", msg);
   }
}
2562 } /* namespace brw */