i965/vs: Implement EXT_texture_swizzle support for VS texturing.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 src_reg::src_reg(dst_reg reg)
34 {
35 init();
36
37 this->file = reg.file;
38 this->reg = reg.reg;
39 this->reg_offset = reg.reg_offset;
40 this->type = reg.type;
41 this->reladdr = reg.reladdr;
42 this->fixed_hw_reg = reg.fixed_hw_reg;
43
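/* Convert the dst writemask into a src swizzle: the enabled channels are
 * packed into the leading swizzle slots and the last enabled channel is
 * replicated into the remaining ones (e.g. a writemask of XZ yields .xzzz).
 */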
44 int swizzles[4];
45 int next_chan = 0;
46 int last = 0;
47
48 for (int i = 0; i < 4; i++) {
49 if (!(reg.writemask & (1 << i)))
50 continue;
51
52 swizzles[next_chan++] = last = i;
53 }
54
55 for (; next_chan < 4; next_chan++) {
56 swizzles[next_chan] = last;
57 }
58
59 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
60 swizzles[2], swizzles[3]);
61 }
62
63 dst_reg::dst_reg(src_reg reg)
64 {
65 init();
66
67 this->file = reg.file;
68 this->reg = reg.reg;
69 this->reg_offset = reg.reg_offset;
70 this->type = reg.type;
71 this->writemask = WRITEMASK_XYZW;
72 this->reladdr = reg.reladdr;
73 this->fixed_hw_reg = reg.fixed_hw_reg;
74 }
75
76 vec4_instruction::vec4_instruction(vec4_visitor *v,
77 enum opcode opcode, dst_reg dst,
78 src_reg src0, src_reg src1, src_reg src2)
79 {
80 this->opcode = opcode;
81 this->dst = dst;
82 this->src[0] = src0;
83 this->src[1] = src1;
84 this->src[2] = src2;
85 this->ir = v->base_ir;
86 this->annotation = v->current_annotation;
87 }
88
89 vec4_instruction *
90 vec4_visitor::emit(vec4_instruction *inst)
91 {
92 this->instructions.push_tail(inst);
93
94 return inst;
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
99 {
100 new_inst->ir = inst->ir;
101 new_inst->annotation = inst->annotation;
102
103 inst->insert_before(new_inst);
104
105 return inst;
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
110 src_reg src0, src_reg src1, src_reg src2)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
113 src0, src1, src2));
114 }
115
116
117 vec4_instruction *
118 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
119 {
120 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
121 }
122
123 vec4_instruction *
124 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
125 {
126 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
127 }
128
129 vec4_instruction *
130 vec4_visitor::emit(enum opcode opcode)
131 {
132 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
133 }
134
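/* The ALU1/ALU2 macros below generate thin builder methods: each one
 * constructs a vec4_instruction for the matching BRW opcode without
 * emitting it, so callers can tweak the instruction before passing it
 * to emit().
 */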
135 #define ALU1(op) \
136 vec4_instruction * \
137 vec4_visitor::op(dst_reg dst, src_reg src0) \
138 { \
139 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
140 src0); \
141 }
142
143 #define ALU2(op) \
144 vec4_instruction * \
145 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
146 { \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU2(ADD)
158 ALU2(MUL)
159 ALU2(MACH)
160 ALU2(AND)
161 ALU2(OR)
162 ALU2(XOR)
163 ALU2(DP3)
164 ALU2(DP4)
165
166 /** Gen4 predicated IF. */
167 vec4_instruction *
168 vec4_visitor::IF(uint32_t predicate)
169 {
170 vec4_instruction *inst;
171
172 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
173 inst->predicate = predicate;
174
175 return inst;
176 }
177
178 /** Gen6+ IF with embedded comparison. */
179 vec4_instruction *
180 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
181 {
182 assert(intel->gen >= 6);
183
184 vec4_instruction *inst;
185
186 resolve_ud_negate(&src0);
187 resolve_ud_negate(&src1);
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
190 src0, src1);
191 inst->conditional_mod = condition;
192
193 return inst;
194 }
195
196 /**
197 * CMP: Sets the low bit of the destination channels with the result
198 * of the comparison, while the upper bits are undefined, and updates
199 * the flag register with the packed 16 bits of the result.
200 */
201 vec4_instruction *
202 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
203 {
204 vec4_instruction *inst;
205
206 /* original gen4 does type conversion to the destination type
207 * before comparison, producing garbage results for floating
208 * point comparisons.
209 */
210 if (intel->gen == 4) {
211 dst.type = src0.type;
212 if (dst.file == HW_REG)
213 dst.fixed_hw_reg.type = dst.type;
214 }
215
216 resolve_ud_negate(&src0);
217 resolve_ud_negate(&src1);
218
219 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
220 inst->conditional_mod = condition;
221
222 return inst;
223 }
224
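/* Scratch access uses send messages with fixed base MRFs: reads are a
 * one-register message (m14), writes a two-register message (m13, the
 * header plus the data to be stored).
 */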
225 vec4_instruction *
226 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
227 {
228 vec4_instruction *inst;
229
230 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
231 dst, index);
232 inst->base_mrf = 14;
233 inst->mlen = 1;
234
235 return inst;
236 }
237
238 vec4_instruction *
239 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
240 {
241 vec4_instruction *inst;
242
243 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
244 dst, src, index);
245 inst->base_mrf = 13;
246 inst->mlen = 2;
247
248 return inst;
249 }
250
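/* Emit a DP2/DP3/DP4 dot product, selected by the component count (2-4). */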
251 void
252 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
253 {
254 static enum opcode dot_opcodes[] = {
255 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
256 };
257
258 emit(dot_opcodes[elements - 2], dst, src0, src1);
259 }
260
261 void
262 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
263 {
264 /* The gen6 math instruction ignores the source modifiers --
265 * swizzle, abs, negate, and at least some parts of the register
266 * region description.
267 *
268 * While it would seem that this MOV could be avoided at this point
269 * in the case that the swizzle is matched up with the destination
270 * writemask, note that uniform packing and register allocation
271 * could rearrange our swizzle, so let's leave this matter up to
272 * copy propagation later.
273 */
274 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
275 emit(MOV(dst_reg(temp_src), src));
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, temp_src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, temp_src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
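/* Emit a one-source math instruction, validating the opcode and
 * dispatching to the gen4 (message-based) or gen6+ implementation.
 */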
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src_reg expanded;
328
329 /* The gen6 math instruction ignores the source modifiers --
330 * swizzle, abs, negate, and at least some parts of the register
331 * region description. Move the sources to temporaries to make it
332 * generally work.
333 */
334
335 expanded = src_reg(this, glsl_type::vec4_type);
336 expanded.type = src0.type;
337 emit(MOV(dst_reg(expanded), src0));
338 src0 = expanded;
339
340 expanded = src_reg(this, glsl_type::vec4_type);
341 expanded.type = src1.type;
342 emit(MOV(dst_reg(expanded), src1));
343 src1 = expanded;
344
345 if (dst.writemask != WRITEMASK_XYZW) {
346 /* The gen6 math instruction must be align1, so we can't do
347 * writemasks.
348 */
349 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
350 temp_dst.type = dst.type;
351
352 emit(opcode, temp_dst, src0, src1);
353
354 emit(MOV(dst, src_reg(temp_dst)));
355 } else {
356 emit(opcode, dst, src0, src1);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen4(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 vec4_instruction *inst = emit(opcode, dst, src0, src1);
365 inst->base_mrf = 1;
366 inst->mlen = 2;
367 }
368
369 void
370 vec4_visitor::emit_math(enum opcode opcode,
371 dst_reg dst, src_reg src0, src_reg src1)
372 {
373 switch (opcode) {
374 case SHADER_OPCODE_POW:
375 case SHADER_OPCODE_INT_QUOTIENT:
376 case SHADER_OPCODE_INT_REMAINDER:
377 break;
378 default:
379 assert(!"not reached: unsupported binary math opcode");
380 return;
381 }
382
383 if (intel->gen >= 6) {
384 return emit_math2_gen6(opcode, dst, src0, src1);
385 } else {
386 return emit_math2_gen4(opcode, dst, src0, src1);
387 }
388 }
389
390 void
391 vec4_visitor::visit_instructions(const exec_list *list)
392 {
393 foreach_list(node, list) {
394 ir_instruction *ir = (ir_instruction *)node;
395
396 base_ir = ir;
397 ir->accept(this);
398 }
399 }
400
401
402 static int
403 type_size(const struct glsl_type *type)
404 {
405 unsigned int i;
406 int size;
407
408 switch (type->base_type) {
409 case GLSL_TYPE_UINT:
410 case GLSL_TYPE_INT:
411 case GLSL_TYPE_FLOAT:
412 case GLSL_TYPE_BOOL:
413 if (type->is_matrix()) {
414 return type->matrix_columns;
415 } else {
416 /* Regardless of the size of the vector, it gets a vec4. This is bad
417 * packing for things like floats, but otherwise arrays become a
418 * mess. Hopefully a later pass over the code can pack scalars
419 * down if appropriate.
420 */
421 return 1;
422 }
423 case GLSL_TYPE_ARRAY:
424 assert(type->length > 0);
425 return type_size(type->fields.array) * type->length;
426 case GLSL_TYPE_STRUCT:
427 size = 0;
428 for (i = 0; i < type->length; i++) {
429 size += type_size(type->fields.structure[i].type);
430 }
431 return size;
432 case GLSL_TYPE_SAMPLER:
433 /* Samplers take up one slot in UNIFORMS[], but they're baked in
434 * at link time.
435 */
436 return 1;
437 default:
438 assert(0);
439 return 0;
440 }
441 }
442
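/* Allocate a new virtual GRF of the given size in vec4 registers,
 * growing the size and register-offset tracking arrays as needed, and
 * return its virtual GRF index.
 */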
443 int
444 vec4_visitor::virtual_grf_alloc(int size)
445 {
446 if (virtual_grf_array_size <= virtual_grf_count) {
447 if (virtual_grf_array_size == 0)
448 virtual_grf_array_size = 16;
449 else
450 virtual_grf_array_size *= 2;
451 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
452 virtual_grf_array_size);
453 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
454 virtual_grf_array_size);
455 }
456 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
457 virtual_grf_reg_count += size;
458 virtual_grf_sizes[virtual_grf_count] = size;
459 return virtual_grf_count++;
460 }
461
462 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
463 {
464 init();
465
466 this->file = GRF;
467 this->reg = v->virtual_grf_alloc(type_size(type));
468
469 if (type->is_array() || type->is_record()) {
470 this->swizzle = BRW_SWIZZLE_NOOP;
471 } else {
472 this->swizzle = swizzle_for_size(type->vector_elements);
473 }
474
475 this->type = brw_type_for_base_type(type);
476 }
477
478 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
479 {
480 init();
481
482 this->file = GRF;
483 this->reg = v->virtual_grf_alloc(type_size(type));
484
485 if (type->is_array() || type->is_record()) {
486 this->writemask = WRITEMASK_XYZW;
487 } else {
488 this->writemask = (1 << type->vector_elements) - 1;
489 }
490
491 this->type = brw_type_for_base_type(type);
492 }
493
494 /* Our support for uniforms is piggy-backed on the struct
495 * gl_vertex_program, because that's where the values actually
496 * get stored, rather than in some global gl_shader_program uniform
497 * store.
498 */
499 int
500 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
501 {
502 unsigned int offset = 0;
503 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
504
505 if (type->is_matrix()) {
506 const glsl_type *column = type->column_type();
507
508 for (unsigned int i = 0; i < type->matrix_columns; i++) {
509 offset += setup_uniform_values(loc + offset, column);
510 }
511
512 return offset;
513 }
514
515 switch (type->base_type) {
516 case GLSL_TYPE_FLOAT:
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_BOOL:
520 for (unsigned int i = 0; i < type->vector_elements; i++) {
521 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
522 }
523
524 /* Set up pad elements to get things aligned to a vec4 boundary. */
525 for (unsigned int i = type->vector_elements; i < 4; i++) {
526 static float zero = 0;
527
528 c->prog_data.param[this->uniforms * 4 + i] = &zero;
529 }
530
531 /* Track the size of this uniform vector, for future packing of
532 * uniforms.
533 */
534 this->uniform_vector_size[this->uniforms] = type->vector_elements;
535 this->uniforms++;
536
537 return 1;
538
539 case GLSL_TYPE_STRUCT:
540 for (unsigned int i = 0; i < type->length; i++) {
541 offset += setup_uniform_values(loc + offset,
542 type->fields.structure[i].type);
543 }
544 return offset;
545
546 case GLSL_TYPE_ARRAY:
547 for (unsigned int i = 0; i < type->length; i++) {
548 offset += setup_uniform_values(loc + offset, type->fields.array);
549 }
550 return offset;
551
552 case GLSL_TYPE_SAMPLER:
553 /* The sampler takes up a slot, but we don't use any values from it. */
554 return 1;
555
556 default:
557 assert(!"not reached");
558 return 0;
559 }
560 }
561
562 void
563 vec4_visitor::setup_uniform_clipplane_values()
564 {
565 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
566
567 /* Pre-Gen6, we compact clip planes. For example, if the user
568 * enables just clip planes 0, 1, and 3, we will enable clip planes
569 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
570 * plane 2. This simplifies the implementation of the Gen6 clip
571 * thread.
572 *
573 * In Gen6 and later, we don't compact clip planes, because this
574 * simplifies the implementation of gl_ClipDistance.
575 */
576 int compacted_clipplane_index = 0;
577 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
578 if (intel->gen < 6 &&
579 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
580 continue;
581 }
582 this->uniform_vector_size[this->uniforms] = 4;
583 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
584 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
585 for (int j = 0; j < 4; ++j) {
586 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
587 }
588 ++compacted_clipplane_index;
589 ++this->uniforms;
590 }
591 }
592
593 /* Our support for builtin uniforms is even scarier than non-builtin.
594 * It sits on top of the PROG_STATE_VAR parameters that are
595 * automatically updated from GL context state.
596 */
597 void
598 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
599 {
600 const ir_state_slot *const slots = ir->state_slots;
601 assert(ir->state_slots != NULL);
602
603 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
604 /* This state reference has already been setup by ir_to_mesa,
605 * but we'll get the same index back here. We can reference
606 * ParameterValues directly, since unlike brw_fs.cpp, we never
607 * add new state references during compile.
608 */
609 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
610 (gl_state_index *)slots[i].tokens);
611 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
612
613 this->uniform_vector_size[this->uniforms] = 0;
614 /* Add each of the unique swizzled channels of the element.
615 * This will end up matching the size of the glsl_type of this field.
616 */
617 int last_swiz = -1;
618 for (unsigned int j = 0; j < 4; j++) {
619 int swiz = GET_SWZ(slots[i].swizzle, j);
620 last_swiz = swiz;
621
622 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
623 if (swiz <= last_swiz)
624 this->uniform_vector_size[this->uniforms]++;
625 }
626 this->uniforms++;
627 }
628 }
629
630 dst_reg *
631 vec4_visitor::variable_storage(ir_variable *var)
632 {
633 return (dst_reg *)hash_table_find(this->variable_ht, var);
634 }
635
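/* Evaluate a boolean rvalue into the flag register via a conditional mod
 * and report the predicate the caller should use: ALL4H/ANY4H for the
 * vector all/any comparisons, BRW_PREDICATE_NORMAL otherwise.
 */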
636 void
637 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
638 {
639 ir_expression *expr = ir->as_expression();
640
641 *predicate = BRW_PREDICATE_NORMAL;
642
643 if (expr) {
644 src_reg op[2];
645 vec4_instruction *inst;
646
647 assert(expr->get_num_operands() <= 2);
648 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
649 expr->operands[i]->accept(this);
650 op[i] = this->result;
651
652 resolve_ud_negate(&op[i]);
653 }
654
655 switch (expr->operation) {
656 case ir_unop_logic_not:
657 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
658 inst->conditional_mod = BRW_CONDITIONAL_Z;
659 break;
660
661 case ir_binop_logic_xor:
662 inst = emit(XOR(dst_null_d(), op[0], op[1]));
663 inst->conditional_mod = BRW_CONDITIONAL_NZ;
664 break;
665
666 case ir_binop_logic_or:
667 inst = emit(OR(dst_null_d(), op[0], op[1]));
668 inst->conditional_mod = BRW_CONDITIONAL_NZ;
669 break;
670
671 case ir_binop_logic_and:
672 inst = emit(AND(dst_null_d(), op[0], op[1]));
673 inst->conditional_mod = BRW_CONDITIONAL_NZ;
674 break;
675
676 case ir_unop_f2b:
677 if (intel->gen >= 6) {
678 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
679 } else {
680 inst = emit(MOV(dst_null_f(), op[0]));
681 inst->conditional_mod = BRW_CONDITIONAL_NZ;
682 }
683 break;
684
685 case ir_unop_i2b:
686 if (intel->gen >= 6) {
687 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
688 } else {
689 inst = emit(MOV(dst_null_d(), op[0]));
690 inst->conditional_mod = BRW_CONDITIONAL_NZ;
691 }
692 break;
693
694 case ir_binop_all_equal:
695 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
696 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
697 break;
698
699 case ir_binop_any_nequal:
700 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
701 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702 break;
703
704 case ir_unop_any:
705 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
706 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
707 break;
708
709 case ir_binop_greater:
710 case ir_binop_gequal:
711 case ir_binop_less:
712 case ir_binop_lequal:
713 case ir_binop_equal:
714 case ir_binop_nequal:
715 emit(CMP(dst_null_d(), op[0], op[1],
716 brw_conditional_for_comparison(expr->operation)));
717 break;
718
719 default:
720 assert(!"not reached");
721 break;
722 }
723 return;
724 }
725
726 ir->accept(this);
727
728 resolve_ud_negate(&this->result);
729
730 if (intel->gen >= 6) {
731 vec4_instruction *inst = emit(AND(dst_null_d(),
732 this->result, src_reg(1)));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 } else {
735 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
736 inst->conditional_mod = BRW_CONDITIONAL_NZ;
737 }
738 }
739
740 /**
741 * Emit a gen6 IF statement with the comparison folded into the IF
742 * instruction.
743 */
744 void
745 vec4_visitor::emit_if_gen6(ir_if *ir)
746 {
747 ir_expression *expr = ir->condition->as_expression();
748
749 if (expr) {
750 src_reg op[2];
751 dst_reg temp;
752
753 assert(expr->get_num_operands() <= 2);
754 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
755 expr->operands[i]->accept(this);
756 op[i] = this->result;
757 }
758
759 switch (expr->operation) {
760 case ir_unop_logic_not:
761 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
762 return;
763
764 case ir_binop_logic_xor:
765 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
766 return;
767
768 case ir_binop_logic_or:
769 temp = dst_reg(this, glsl_type::bool_type);
770 emit(OR(temp, op[0], op[1]));
771 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
772 return;
773
774 case ir_binop_logic_and:
775 temp = dst_reg(this, glsl_type::bool_type);
776 emit(AND(temp, op[0], op[1]));
777 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
778 return;
779
780 case ir_unop_f2b:
781 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
782 return;
783
784 case ir_unop_i2b:
785 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
786 return;
787
788 case ir_binop_greater:
789 case ir_binop_gequal:
790 case ir_binop_less:
791 case ir_binop_lequal:
792 case ir_binop_equal:
793 case ir_binop_nequal:
794 emit(IF(op[0], op[1],
795 brw_conditional_for_comparison(expr->operation)));
796 return;
797
798 case ir_binop_all_equal:
799 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
800 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
801 return;
802
803 case ir_binop_any_nequal:
804 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
805 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
806 return;
807
808 case ir_unop_any:
809 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
810 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
811 return;
812
813 default:
814 assert(!"not reached");
815 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
816 return;
817 }
818 return;
819 }
820
821 ir->condition->accept(this);
822
823 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
824 }
825
826 void
827 vec4_visitor::visit(ir_variable *ir)
828 {
829 dst_reg *reg = NULL;
830
831 if (variable_storage(ir))
832 return;
833
834 switch (ir->mode) {
835 case ir_var_in:
836 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
837
838 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
839 * come in as floating point conversions of the integer values.
840 */
841 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
842 if (!c->key.gl_fixed_input_size[i])
843 continue;
844
845 dst_reg dst = *reg;
846 dst.type = brw_type_for_base_type(ir->type);
847 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
848 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
849 }
850 break;
851
852 case ir_var_out:
853 reg = new(mem_ctx) dst_reg(this, ir->type);
854
855 for (int i = 0; i < type_size(ir->type); i++) {
856 output_reg[ir->location + i] = *reg;
857 output_reg[ir->location + i].reg_offset = i;
858 output_reg[ir->location + i].type =
859 brw_type_for_base_type(ir->type->get_scalar_type());
860 output_reg_annotation[ir->location + i] = ir->name;
861 }
862 break;
863
864 case ir_var_auto:
865 case ir_var_temporary:
866 reg = new(mem_ctx) dst_reg(this, ir->type);
867 break;
868
869 case ir_var_uniform:
870 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
871
872 /* Track how big the whole uniform variable is, in case we need to put a
873 * copy of its data into pull constants for array access.
874 */
875 this->uniform_size[this->uniforms] = type_size(ir->type);
876
877 if (!strncmp(ir->name, "gl_", 3)) {
878 setup_builtin_uniform_values(ir);
879 } else {
880 setup_uniform_values(ir->location, ir->type);
881 }
882 break;
883
884 case ir_var_system_value:
885 /* VertexID is stored by the VF as the last vertex element, but
886 * we don't represent it with a flag in inputs_read, so we call
887 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
888 */
889 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
890 prog_data->uses_vertexid = true;
891
892 switch (ir->location) {
893 case SYSTEM_VALUE_VERTEX_ID:
894 reg->writemask = WRITEMASK_X;
895 break;
896 case SYSTEM_VALUE_INSTANCE_ID:
897 reg->writemask = WRITEMASK_Y;
898 break;
899 default:
900 assert(!"not reached");
901 break;
902 }
903 break;
904
905 default:
906 assert(!"not reached");
907 }
908
909 reg->type = brw_type_for_base_type(ir->type);
910 hash_table_insert(this->variable_ht, reg, ir);
911 }
912
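/* Lower ir_loop to the hardware DO/WHILE construct: initialize the loop
 * counter if one exists, test it against the "to" bound with a predicated
 * BREAK at the top of the body, and apply the increment before the WHILE.
 */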
913 void
914 vec4_visitor::visit(ir_loop *ir)
915 {
916 dst_reg counter;
917
918 /* We don't want debugging output to print the whole body of the
919 * loop as the annotation.
920 */
921 this->base_ir = NULL;
922
923 if (ir->counter != NULL) {
924 this->base_ir = ir->counter;
925 ir->counter->accept(this);
926 counter = *(variable_storage(ir->counter));
927
928 if (ir->from != NULL) {
929 this->base_ir = ir->from;
930 ir->from->accept(this);
931
932 emit(MOV(counter, this->result));
933 }
934 }
935
936 emit(BRW_OPCODE_DO);
937
938 if (ir->to) {
939 this->base_ir = ir->to;
940 ir->to->accept(this);
941
942 emit(CMP(dst_null_d(), src_reg(counter), this->result,
943 brw_conditional_for_comparison(ir->cmp)));
944
945 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
946 inst->predicate = BRW_PREDICATE_NORMAL;
947 }
948
949 visit_instructions(&ir->body_instructions);
950
951
952 if (ir->increment) {
953 this->base_ir = ir->increment;
954 ir->increment->accept(this);
955 emit(ADD(counter, src_reg(counter), this->result));
956 }
957
958 emit(BRW_OPCODE_WHILE);
959 }
960
961 void
962 vec4_visitor::visit(ir_loop_jump *ir)
963 {
964 switch (ir->mode) {
965 case ir_loop_jump::jump_break:
966 emit(BRW_OPCODE_BREAK);
967 break;
968 case ir_loop_jump::jump_continue:
969 emit(BRW_OPCODE_CONTINUE);
970 break;
971 }
972 }
973
974
975 void
976 vec4_visitor::visit(ir_function_signature *ir)
977 {
978 assert(0);
979 (void)ir;
980 }
981
982 void
983 vec4_visitor::visit(ir_function *ir)
984 {
985 /* Ignore function bodies other than main() -- we shouldn't see calls to
986 * them since they should all be inlined.
987 */
988 if (strcmp(ir->name, "main") == 0) {
989 const ir_function_signature *sig;
990 exec_list empty;
991
992 sig = ir->matching_signature(&empty);
993
994 assert(sig);
995
996 visit_instructions(&sig->body);
997 }
998 }
999
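/* If the expression is just a saturate of another rvalue, emit it as a
 * saturating MOV into a fresh temporary and return true so the caller can
 * skip the normal expression handling.
 */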
1000 bool
1001 vec4_visitor::try_emit_sat(ir_expression *ir)
1002 {
1003 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1004 if (!sat_src)
1005 return false;
1006
1007 sat_src->accept(this);
1008 src_reg src = this->result;
1009
1010 this->result = src_reg(this, ir->type);
1011 vec4_instruction *inst;
1012 inst = emit(MOV(dst_reg(this->result), src));
1013 inst->saturate = true;
1014
1015 return true;
1016 }
1017
1018 void
1019 vec4_visitor::emit_bool_comparison(unsigned int op,
1020 dst_reg dst, src_reg src0, src_reg src1)
1021 {
1022 /* original gen4 does destination conversion before comparison. */
1023 if (intel->gen < 5)
1024 dst.type = src0.type;
1025
1026 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1027
1028 dst.type = BRW_REGISTER_TYPE_D;
1029 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1030 }
1031
1032 void
1033 vec4_visitor::visit(ir_expression *ir)
1034 {
1035 unsigned int operand;
1036 src_reg op[Elements(ir->operands)];
1037 src_reg result_src;
1038 dst_reg result_dst;
1039 vec4_instruction *inst;
1040
1041 if (try_emit_sat(ir))
1042 return;
1043
1044 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1045 this->result.file = BAD_FILE;
1046 ir->operands[operand]->accept(this);
1047 if (this->result.file == BAD_FILE) {
1048 printf("Failed to get tree for expression operand:\n");
1049 ir->operands[operand]->print();
1050 exit(1);
1051 }
1052 op[operand] = this->result;
1053
1054 /* Matrix expression operands should have been broken down to vector
1055 * operations already.
1056 */
1057 assert(!ir->operands[operand]->type->is_matrix());
1058 }
1059
1060 int vector_elements = ir->operands[0]->type->vector_elements;
1061 if (ir->operands[1]) {
1062 vector_elements = MAX2(vector_elements,
1063 ir->operands[1]->type->vector_elements);
1064 }
1065
1066 this->result.file = BAD_FILE;
1067
1068 /* Storage for our result. Ideally for an assignment we'd be using
1069 * the actual storage for the result here, instead.
1070 */
1071 result_src = src_reg(this, ir->type);
1072 /* convenience for the emit functions below. */
1073 result_dst = dst_reg(result_src);
1074 /* If nothing special happens, this is the result. */
1075 this->result = result_src;
1076 /* Limit writes to the channels that will be used by result_src later.
1077 * This does limit this temp's use as a temporary for multi-instruction
1078 * sequences.
1079 */
1080 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1081
1082 switch (ir->operation) {
1083 case ir_unop_logic_not:
1084 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1085 * the one's complement of the whole register, not just bit 0.
1086 */
1087 emit(XOR(result_dst, op[0], src_reg(1)));
1088 break;
1089 case ir_unop_neg:
1090 op[0].negate = !op[0].negate;
1091 this->result = op[0];
1092 break;
1093 case ir_unop_abs:
1094 op[0].abs = true;
1095 op[0].negate = false;
1096 this->result = op[0];
1097 break;
1098
1099 case ir_unop_sign:
1100 emit(MOV(result_dst, src_reg(0.0f)));
1101
1102 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1103 inst = emit(MOV(result_dst, src_reg(1.0f)));
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105
1106 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1107 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1108 inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110 break;
1111
1112 case ir_unop_rcp:
1113 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1114 break;
1115
1116 case ir_unop_exp2:
1117 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1118 break;
1119 case ir_unop_log2:
1120 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1121 break;
1122 case ir_unop_exp:
1123 case ir_unop_log:
1124 assert(!"not reached: should be handled by ir_explog_to_explog2");
1125 break;
1126 case ir_unop_sin:
1127 case ir_unop_sin_reduced:
1128 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1129 break;
1130 case ir_unop_cos:
1131 case ir_unop_cos_reduced:
1132 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1133 break;
1134
1135 case ir_unop_dFdx:
1136 case ir_unop_dFdy:
1137 assert(!"derivatives not valid in vertex shader");
1138 break;
1139
1140 case ir_unop_noise:
1141 assert(!"not reached: should be handled by lower_noise");
1142 break;
1143
1144 case ir_binop_add:
1145 emit(ADD(result_dst, op[0], op[1]));
1146 break;
1147 case ir_binop_sub:
1148 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1149 break;
1150
1151 case ir_binop_mul:
1152 if (ir->type->is_integer()) {
1153 /* For integer multiplication, the MUL uses the low 16 bits
1154 * of one of the operands (src0 on gen6, src1 on gen7). The
1155 * MACH accumulates in the contribution of the upper 16 bits
1156 * of that operand.
1157 *
1158 * FINISHME: Emit just the MUL if we know an operand is small
1159 * enough.
1160 */
1161 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1162
1163 emit(MUL(acc, op[0], op[1]));
1164 emit(MACH(dst_null_d(), op[0], op[1]));
1165 emit(MOV(result_dst, src_reg(acc)));
1166 } else {
1167 emit(MUL(result_dst, op[0], op[1]));
1168 }
1169 break;
1170 case ir_binop_div:
1171 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1172 assert(ir->type->is_integer());
1173 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1174 break;
1175 case ir_binop_mod:
1176 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1177 assert(ir->type->is_integer());
1178 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1179 break;
1180
1181 case ir_binop_less:
1182 case ir_binop_greater:
1183 case ir_binop_lequal:
1184 case ir_binop_gequal:
1185 case ir_binop_equal:
1186 case ir_binop_nequal: {
1187 emit(CMP(result_dst, op[0], op[1],
1188 brw_conditional_for_comparison(ir->operation)));
1189 emit(AND(result_dst, result_src, src_reg(0x1)));
1190 break;
1191 }
1192
1193 case ir_binop_all_equal:
1194 /* "==" operator producing a scalar boolean. */
1195 if (ir->operands[0]->type->is_vector() ||
1196 ir->operands[1]->type->is_vector()) {
1197 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1198 emit(MOV(result_dst, src_reg(0)));
1199 inst = emit(MOV(result_dst, src_reg(1)));
1200 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1201 } else {
1202 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1203 emit(AND(result_dst, result_src, src_reg(0x1)));
1204 }
1205 break;
1206 case ir_binop_any_nequal:
1207 /* "!=" operator producing a scalar boolean. */
1208 if (ir->operands[0]->type->is_vector() ||
1209 ir->operands[1]->type->is_vector()) {
1210 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1211
1212 emit(MOV(result_dst, src_reg(0)));
1213 inst = emit(MOV(result_dst, src_reg(1)));
1214 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1215 } else {
1216 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1217 emit(AND(result_dst, result_src, src_reg(0x1)));
1218 }
1219 break;
1220
1221 case ir_unop_any:
1222 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1223 emit(MOV(result_dst, src_reg(0)));
1224
1225 inst = emit(MOV(result_dst, src_reg(1)));
1226 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1227 break;
1228
1229 case ir_binop_logic_xor:
1230 emit(XOR(result_dst, op[0], op[1]));
1231 break;
1232
1233 case ir_binop_logic_or:
1234 emit(OR(result_dst, op[0], op[1]));
1235 break;
1236
1237 case ir_binop_logic_and:
1238 emit(AND(result_dst, op[0], op[1]));
1239 break;
1240
1241 case ir_binop_dot:
1242 assert(ir->operands[0]->type->is_vector());
1243 assert(ir->operands[0]->type == ir->operands[1]->type);
1244 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1245 break;
1246
1247 case ir_unop_sqrt:
1248 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1249 break;
1250 case ir_unop_rsq:
1251 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1252 break;
1253 case ir_unop_i2f:
1254 case ir_unop_i2u:
1255 case ir_unop_u2i:
1256 case ir_unop_u2f:
1257 case ir_unop_b2f:
1258 case ir_unop_b2i:
1259 case ir_unop_f2i:
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262 case ir_unop_f2b:
1263 case ir_unop_i2b: {
1264 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1265 emit(AND(result_dst, result_src, src_reg(1)));
1266 break;
1267 }
1268
1269 case ir_unop_trunc:
1270 emit(RNDZ(result_dst, op[0]));
1271 break;
1272 case ir_unop_ceil:
1273 op[0].negate = !op[0].negate;
1274 inst = emit(RNDD(result_dst, op[0]));
1275 this->result.negate = true;
1276 break;
1277 case ir_unop_floor:
1278 inst = emit(RNDD(result_dst, op[0]));
1279 break;
1280 case ir_unop_fract:
1281 inst = emit(FRC(result_dst, op[0]));
1282 break;
1283 case ir_unop_round_even:
1284 emit(RNDE(result_dst, op[0]));
1285 break;
1286
1287 case ir_binop_min:
1288 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1289
1290 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1291 inst->predicate = BRW_PREDICATE_NORMAL;
1292 break;
1293 case ir_binop_max:
1294 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1295
1296 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1297 inst->predicate = BRW_PREDICATE_NORMAL;
1298 break;
1299
1300 case ir_binop_pow:
1301 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1302 break;
1303
1304 case ir_unop_bit_not:
1305 inst = emit(NOT(result_dst, op[0]));
1306 break;
1307 case ir_binop_bit_and:
1308 inst = emit(AND(result_dst, op[0], op[1]));
1309 break;
1310 case ir_binop_bit_xor:
1311 inst = emit(XOR(result_dst, op[0], op[1]));
1312 break;
1313 case ir_binop_bit_or:
1314 inst = emit(OR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_lshift:
1318 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1319 break;
1320
1321 case ir_binop_rshift:
1322 if (ir->type->base_type == GLSL_TYPE_INT)
1323 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1324 else
1325 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1326 break;
1327
1328 case ir_quadop_vector:
1329 assert(!"not reached: should be handled by lower_quadop_vector");
1330 break;
1331 }
1332 }
1333
1334
1335 void
1336 vec4_visitor::visit(ir_swizzle *ir)
1337 {
1338 src_reg src;
1339 int i = 0;
1340 int swizzle[4];
1341
1342 /* Note that this only handles swizzles in expressions, not those on the left
1343 * hand side of an assignment, which do write masking. See ir_assignment
1344 * for that.
1345 */
1346
1347 ir->val->accept(this);
1348 src = this->result;
1349 assert(src.file != BAD_FILE);
1350
1351 for (i = 0; i < ir->type->vector_elements; i++) {
1352 switch (i) {
1353 case 0:
1354 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1355 break;
1356 case 1:
1357 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1358 break;
1359 case 2:
1360 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1361 break;
1362 case 3:
1363 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1364 break;
1365 }
1366 }
1367 for (; i < 4; i++) {
1368 /* Replicate the last channel out. */
1369 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1370 }
1371
1372 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1373
1374 this->result = src;
1375 }
1376
1377 void
1378 vec4_visitor::visit(ir_dereference_variable *ir)
1379 {
1380 const struct glsl_type *type = ir->type;
1381 dst_reg *reg = variable_storage(ir->var);
1382
1383 if (!reg) {
1384 fail("Failed to find variable storage for %s\n", ir->var->name);
1385 this->result = src_reg(brw_null_reg());
1386 return;
1387 }
1388
1389 this->result = src_reg(*reg);
1390
1391 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1392 this->result.swizzle = swizzle_for_size(type->vector_elements);
1393 }
1394
1395 void
1396 vec4_visitor::visit(ir_dereference_array *ir)
1397 {
1398 ir_constant *constant_index;
1399 src_reg src;
1400 int element_size = type_size(ir->type);
1401
1402 constant_index = ir->array_index->constant_expression_value();
1403
1404 ir->array->accept(this);
1405 src = this->result;
1406
1407 if (constant_index) {
1408 src.reg_offset += constant_index->value.i[0] * element_size;
1409 } else {
1410 /* Variable index array dereference. It eats the "vec4" of the
1411 * base of the array and an index that offsets the Mesa register
1412 * index.
1413 */
1414 ir->array_index->accept(this);
1415
1416 src_reg index_reg;
1417
1418 if (element_size == 1) {
1419 index_reg = this->result;
1420 } else {
1421 index_reg = src_reg(this, glsl_type::int_type);
1422
1423 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1424 }
1425
1426 if (src.reladdr) {
1427 src_reg temp = src_reg(this, glsl_type::int_type);
1428
1429 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1430
1431 index_reg = temp;
1432 }
1433
1434 src.reladdr = ralloc(mem_ctx, src_reg);
1435 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1436 }
1437
1438 /* If the type is smaller than a vec4, replicate the last channel out. */
1439 if (ir->type->is_scalar() || ir->type->is_vector())
1440 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1441 else
1442 src.swizzle = BRW_SWIZZLE_NOOP;
1443 src.type = brw_type_for_base_type(ir->type);
1444
1445 this->result = src;
1446 }
1447
1448 void
1449 vec4_visitor::visit(ir_dereference_record *ir)
1450 {
1451 unsigned int i;
1452 const glsl_type *struct_type = ir->record->type;
1453 int offset = 0;
1454
1455 ir->record->accept(this);
1456
1457 for (i = 0; i < struct_type->length; i++) {
1458 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1459 break;
1460 offset += type_size(struct_type->fields.structure[i].type);
1461 }
1462
1463 /* If the type is smaller than a vec4, replicate the last channel out. */
1464 if (ir->type->is_scalar() || ir->type->is_vector())
1465 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1466 else
1467 this->result.swizzle = BRW_SWIZZLE_NOOP;
1468 this->result.type = brw_type_for_base_type(ir->type);
1469
1470 this->result.reg_offset += offset;
1471 }
1472
1473 /**
1474 * We want to be careful in assignment setup to hit the actual storage
1475 * instead of potentially using a temporary like we might with the
1476 * ir_dereference handler.
1477 */
1478 static dst_reg
1479 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1480 {
1481 /* The LHS must be a dereference. If the LHS is a variable indexed array
1482 * access of a vector, it must be separated into a series conditional moves
1483 * before reaching this point (see ir_vec_index_to_cond_assign).
1484 */
1485 assert(ir->as_dereference());
1486 ir_dereference_array *deref_array = ir->as_dereference_array();
1487 if (deref_array) {
1488 assert(!deref_array->array->type->is_vector());
1489 }
1490
1491 /* Use the rvalue deref handler for the most part. We'll ignore
1492 * swizzles in it and write swizzles using writemask, though.
1493 */
1494 ir->accept(v);
1495 return dst_reg(v->result);
1496 }
1497
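/* Copy a value of arbitrary (possibly aggregate) type one vec4 at a time,
 * recursing through structs, arrays and matrix columns and advancing the
 * dst/src reg_offsets as it goes; the predicate makes each MOV conditional.
 */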
1498 void
1499 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1500 const struct glsl_type *type, uint32_t predicate)
1501 {
1502 if (type->base_type == GLSL_TYPE_STRUCT) {
1503 for (unsigned int i = 0; i < type->length; i++) {
1504 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1505 }
1506 return;
1507 }
1508
1509 if (type->is_array()) {
1510 for (unsigned int i = 0; i < type->length; i++) {
1511 emit_block_move(dst, src, type->fields.array, predicate);
1512 }
1513 return;
1514 }
1515
1516 if (type->is_matrix()) {
1517 const struct glsl_type *vec_type;
1518
1519 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1520 type->vector_elements, 1);
1521
1522 for (int i = 0; i < type->matrix_columns; i++) {
1523 emit_block_move(dst, src, vec_type, predicate);
1524 }
1525 return;
1526 }
1527
1528 assert(type->is_scalar() || type->is_vector());
1529
1530 dst->type = brw_type_for_base_type(type);
1531 src->type = dst->type;
1532
1533 dst->writemask = (1 << type->vector_elements) - 1;
1534
1535 /* Do we need to worry about swizzling a swizzle? */
1536 assert(src->swizzle == BRW_SWIZZLE_NOOP
1537 || src->swizzle == swizzle_for_size(type->vector_elements));
1538 src->swizzle = swizzle_for_size(type->vector_elements);
1539
1540 vec4_instruction *inst = emit(MOV(*dst, *src));
1541 inst->predicate = predicate;
1542
1543 dst->reg_offset++;
1544 src->reg_offset++;
1545 }
1546
1547
1548 /* If the RHS processing resulted in an instruction generating a
1549 * temporary value, and it would be easy to rewrite the instruction to
1550 * generate its result right into the LHS instead, do so. This ends
1551 * up reliably removing instructions where it can be tricky to do so
1552 * later without real UD chain information.
1553 */
1554 bool
1555 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1556 dst_reg dst,
1557 src_reg src,
1558 vec4_instruction *pre_rhs_inst,
1559 vec4_instruction *last_rhs_inst)
1560 {
1561 /* This could be supported, but it would take more smarts. */
1562 if (ir->condition)
1563 return false;
1564
1565 if (pre_rhs_inst == last_rhs_inst)
1566 return false; /* No instructions generated to work with. */
1567
1568 /* Make sure the last instruction generated our source reg. */
1569 if (src.file != GRF ||
1570 src.file != last_rhs_inst->dst.file ||
1571 src.reg != last_rhs_inst->dst.reg ||
1572 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1573 src.reladdr ||
1574 src.abs ||
1575 src.negate ||
1576 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1577 return false;
1578
1579 /* Check that the last instruction fully initialized the channels
1580 * we want to use, in the order we want to use them. We could
1581 * potentially reswizzle the operands of many instructions so that
1582 * we could handle out of order channels, but don't yet.
1583 */
1584
1585 for (unsigned i = 0; i < 4; i++) {
1586 if (dst.writemask & (1 << i)) {
1587 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1588 return false;
1589
1590 if (BRW_GET_SWZ(src.swizzle, i) != i)
1591 return false;
1592 }
1593 }
1594
1595 /* Success! Rewrite the instruction. */
1596 last_rhs_inst->dst.file = dst.file;
1597 last_rhs_inst->dst.reg = dst.reg;
1598 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1599 last_rhs_inst->dst.reladdr = dst.reladdr;
1600 last_rhs_inst->dst.writemask &= dst.writemask;
1601
1602 return true;
1603 }
1604
1605 void
1606 vec4_visitor::visit(ir_assignment *ir)
1607 {
1608 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1609 uint32_t predicate = BRW_PREDICATE_NONE;
1610
1611 if (!ir->lhs->type->is_scalar() &&
1612 !ir->lhs->type->is_vector()) {
1613 ir->rhs->accept(this);
1614 src_reg src = this->result;
1615
1616 if (ir->condition) {
1617 emit_bool_to_cond_code(ir->condition, &predicate);
1618 }
1619
1620 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1621 return;
1622 }
1623
1624 /* Now we're down to just a scalar/vector with writemasks. */
1625 int i;
1626
1627 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1628 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1629
1630 ir->rhs->accept(this);
1631
1632 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1633
1634 src_reg src = this->result;
1635
1636 int swizzles[4];
1637 int first_enabled_chan = 0;
1638 int src_chan = 0;
1639
1640 assert(ir->lhs->type->is_vector() ||
1641 ir->lhs->type->is_scalar());
1642 dst.writemask = ir->write_mask;
1643
1644 for (int i = 0; i < 4; i++) {
1645 if (dst.writemask & (1 << i)) {
1646 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1647 break;
1648 }
1649 }
1650
1651 /* Swizzle a small RHS vector into the channels being written.
1652 *
1653 * glsl ir treats write_mask as dictating how many channels are
1654 * present on the RHS, while in our instructions we need to make
1655 * those channels appear in the slots of the vec4 they're written to.
1656 */
1657 for (int i = 0; i < 4; i++) {
1658 if (dst.writemask & (1 << i))
1659 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1660 else
1661 swizzles[i] = first_enabled_chan;
1662 }
1663 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1664 swizzles[2], swizzles[3]);
1665
1666 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1667 return;
1668 }
1669
1670 if (ir->condition) {
1671 emit_bool_to_cond_code(ir->condition, &predicate);
1672 }
1673
1674 for (i = 0; i < type_size(ir->lhs->type); i++) {
1675 vec4_instruction *inst = emit(MOV(dst, src));
1676 inst->predicate = predicate;
1677
1678 dst.reg_offset++;
1679 src.reg_offset++;
1680 }
1681 }
1682
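/* Write an ir_constant into *dst one component at a time, recursing
 * through structs, arrays and matrix columns; dst->reg_offset is advanced
 * past each vec4-sized element.
 */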
1683 void
1684 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1685 {
1686 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1687 foreach_list(node, &ir->components) {
1688 ir_constant *field_value = (ir_constant *)node;
1689
1690 emit_constant_values(dst, field_value);
1691 }
1692 return;
1693 }
1694
1695 if (ir->type->is_array()) {
1696 for (unsigned int i = 0; i < ir->type->length; i++) {
1697 emit_constant_values(dst, ir->array_elements[i]);
1698 }
1699 return;
1700 }
1701
1702 if (ir->type->is_matrix()) {
1703 for (int i = 0; i < ir->type->matrix_columns; i++) {
1704 for (int j = 0; j < ir->type->vector_elements; j++) {
1705 dst->writemask = 1 << j;
1706 dst->type = BRW_REGISTER_TYPE_F;
1707
1708 emit(MOV(*dst,
1709 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1710 }
1711 dst->reg_offset++;
1712 }
1713 return;
1714 }
1715
1716 for (int i = 0; i < ir->type->vector_elements; i++) {
1717 dst->writemask = 1 << i;
1718 dst->type = brw_type_for_base_type(ir->type);
1719
1720 switch (ir->type->base_type) {
1721 case GLSL_TYPE_FLOAT:
1722 emit(MOV(*dst, src_reg(ir->value.f[i])));
1723 break;
1724 case GLSL_TYPE_INT:
1725 emit(MOV(*dst, src_reg(ir->value.i[i])));
1726 break;
1727 case GLSL_TYPE_UINT:
1728 emit(MOV(*dst, src_reg(ir->value.u[i])));
1729 break;
1730 case GLSL_TYPE_BOOL:
1731 emit(MOV(*dst, src_reg(ir->value.b[i])));
1732 break;
1733 default:
1734 assert(!"Non-float/uint/int/bool constant");
1735 break;
1736 }
1737 }
1738 dst->reg_offset++;
1739 }
1740
1741 void
1742 vec4_visitor::visit(ir_constant *ir)
1743 {
1744 dst_reg dst = dst_reg(this, ir->type);
1745 this->result = src_reg(dst);
1746
1747 emit_constant_values(&dst, ir);
1748 }
1749
1750 void
1751 vec4_visitor::visit(ir_call *ir)
1752 {
1753 assert(!"not reached");
1754 }
1755
1756 void
1757 vec4_visitor::visit(ir_texture *ir)
1758 {
1759 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1760 sampler = vp->Base.SamplerUnits[sampler];
1761
1762 /* Should be lowered by do_lower_texture_projection */
1763 assert(!ir->projector);
1764
1765 vec4_instruction *inst;
1766 switch (ir->op) {
1767 case ir_tex:
1768 case ir_txl:
1769 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1770 break;
1771 case ir_txd:
1772 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1773 break;
1774 case ir_txf:
1775 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1776 break;
1777 case ir_txs:
1778 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1779 break;
1780 case ir_txb:
1781 assert(!"TXB is not valid for vertex shaders.");
1782 }
1783
1784 /* Texel offsets go in the message header; Gen4 also requires headers. */
1785 inst->header_present = ir->offset || intel->gen < 5;
1786 inst->base_mrf = 2;
1787 inst->mlen = inst->header_present + 1; /* always at least one */
1788 inst->sampler = sampler;
1789 inst->dst = dst_reg(this, glsl_type::get_instance(ir->type->base_type,4,1));
1790 inst->shadow_compare = ir->shadow_comparitor != NULL;
1791
1792 if (ir->offset != NULL)
1793 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1794
1795 /* MRF for the first parameter */
1796 int param_base = inst->base_mrf + inst->header_present;
1797
1798 if (ir->op == ir_txs) {
1799 ir->lod_info.lod->accept(this);
1800 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1801 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1802 this->result));
1803 } else {
1804 int i, coord_mask = 0, zero_mask = 0;
1805 /* Load the coordinate */
1806 /* FINISHME: gl_clamp_mask and saturate */
1807 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1808 coord_mask |= (1 << i);
1809 for (; i < 4; i++)
1810 zero_mask |= (1 << i);
1811
1812 ir->coordinate->accept(this);
1813 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1814 this->result));
1815 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1816 src_reg(0)));
1817 /* Load the shadow comparitor */
1818 if (ir->shadow_comparitor) {
1819 ir->shadow_comparitor->accept(this);
1820 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1821 WRITEMASK_X),
1822 this->result));
1823 inst->mlen++;
1824 }
1825
1826 /* Load the LOD info */
1827 if (ir->op == ir_txl) {
1828 int mrf, writemask;
1829 if (intel->gen >= 5) {
1830 mrf = param_base + 1;
1831 if (ir->shadow_comparitor) {
1832 writemask = WRITEMASK_Y;
1833 /* mlen already incremented */
1834 } else {
1835 writemask = WRITEMASK_X;
1836 inst->mlen++;
1837 }
1838 } else /* intel->gen == 4 */ {
1839 mrf = param_base;
1840 writemask = WRITEMASK_Z;
1841 }
1842 ir->lod_info.lod->accept(this);
1843 emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask),
1844 this->result));
1845 } else if (ir->op == ir_txf) {
1846 ir->lod_info.lod->accept(this);
1847 emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1848 this->result));
1849 } else if (ir->op == ir_txd) {
1850 const glsl_type *type = ir->lod_info.grad.dPdx->type;
1851
1852 ir->lod_info.grad.dPdx->accept(this);
1853 src_reg dPdx = this->result;
1854 ir->lod_info.grad.dPdy->accept(this);
1855 src_reg dPdy = this->result;
1856
1857 if (intel->gen >= 5) {
1858 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1859 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1860 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1861 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1862 inst->mlen++;
1863
1864 if (ir->type->vector_elements == 3) {
1865 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1866 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1867 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1868 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1869 inst->mlen++;
1870 }
1871 } else /* intel->gen == 4 */ {
1872 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1873 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1874 inst->mlen += 2;
1875 }
1876 }
1877 }
1878
1879 emit(inst);
1880
1881 swizzle_result(ir, src_reg(inst->dst), sampler);
1882 }
1883
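/* Apply the EXT_texture_swizzle swizzle from the sampler key to a texture
 * result: channels sourced from the original value are handled with one
 * swizzled MOV, while SWIZZLE_ZERO and SWIZZLE_ONE channels are written as
 * the immediates 0.0f and 1.0f.
 */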
1884 void
1885 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
1886 {
1887 this->result = orig_val;
1888
1889 int s = c->key.tex.swizzles[sampler];
1890
1891 if (ir->op == ir_txs || ir->type == glsl_type::float_type
1892 || s == SWIZZLE_NOOP)
1893 return;
1894
1895 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1896 int swizzle[4];
1897
1898 for (int i = 0; i < 4; i++) {
1899 switch (GET_SWZ(s, i)) {
1900 case SWIZZLE_ZERO:
1901 zero_mask |= (1 << i);
1902 break;
1903 case SWIZZLE_ONE:
1904 one_mask |= (1 << i);
1905 break;
1906 default:
1907 copy_mask |= (1 << i);
1908 swizzle[i] = GET_SWZ(s, i);
1909 break;
1910 }
1911 }
1912
1913 this->result = src_reg(this, ir->type);
1914 dst_reg swizzled_result(this->result);
1915
1916 if (copy_mask) {
1917 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1918 swizzled_result.writemask = copy_mask;
1919 emit(MOV(swizzled_result, orig_val));
1920 }
1921
1922 if (zero_mask) {
1923 swizzled_result.writemask = zero_mask;
1924 emit(MOV(swizzled_result, src_reg(0.0f)));
1925 }
1926
1927 if (one_mask) {
1928 swizzled_result.writemask = one_mask;
1929 emit(MOV(swizzled_result, src_reg(1.0f)));
1930 }
1931 }
1932
1933 void
1934 vec4_visitor::visit(ir_return *ir)
1935 {
1936 assert(!"not reached");
1937 }
1938
1939 void
1940 vec4_visitor::visit(ir_discard *ir)
1941 {
1942 assert(!"not reached");
1943 }
1944
1945 void
1946 vec4_visitor::visit(ir_if *ir)
1947 {
1948 /* Don't point the annotation at the if statement, because then it plus
1949 * the then and else blocks get printed.
1950 */
1951 this->base_ir = ir->condition;
1952
1953 if (intel->gen == 6) {
1954 emit_if_gen6(ir);
1955 } else {
1956 uint32_t predicate;
1957 emit_bool_to_cond_code(ir->condition, &predicate);
1958 emit(IF(predicate));
1959 }
1960
1961 visit_instructions(&ir->then_instructions);
1962
1963 if (!ir->else_instructions.is_empty()) {
1964 this->base_ir = ir->condition;
1965 emit(BRW_OPCODE_ELSE);
1966
1967 visit_instructions(&ir->else_instructions);
1968 }
1969
1970 this->base_ir = ir->condition;
1971 emit(BRW_OPCODE_ENDIF);
1972 }
1973
1974 void
1975 vec4_visitor::emit_ndc_computation()
1976 {
1977 /* Get the position */
1978 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1979
1980 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1981 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1982 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1983
1984 current_annotation = "NDC";
1985 dst_reg ndc_w = ndc;
1986 ndc_w.writemask = WRITEMASK_W;
1987 src_reg pos_w = pos;
1988 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1989 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1990
1991 dst_reg ndc_xyz = ndc;
1992 ndc_xyz.writemask = WRITEMASK_XYZ;
1993
1994 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1995 }
1996
1997 void
1998 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1999 {
2000 if (intel->gen < 6 &&
2001 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2002 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2003 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2004 dst_reg header1_w = header1;
2005 header1_w.writemask = WRITEMASK_W;
2006 GLuint i;
2007
2008 emit(MOV(header1, 0u));
2009
2010 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2011 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2012
2013 current_annotation = "Point size";
2014 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2015 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2016 }
2017
2018 current_annotation = "Clipping flags";
2019 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2020 vec4_instruction *inst;
2021
2022 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2023 src_reg(this->userplane[i])));
2024 inst->conditional_mod = BRW_CONDITIONAL_L;
2025
2026 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2027 inst->predicate = BRW_PREDICATE_NORMAL;
2028 }
2029
2030 /* i965 clipping workaround:
2031 * 1) Test for -ve rhw
2032 * 2) If set,
2033 * set ndc = (0,0,0,0)
2034 * set ucp[6] = 1
2035 *
2036 * Later, clipping will detect ucp[6] and ensure the primitive is
2037 * clipped against all fixed planes.
2038 */
2039 if (brw->has_negative_rhw_bug) {
2040 #if 0
2041 /* FINISHME */
2042 brw_CMP(p,
2043 vec8(brw_null_reg()),
2044 BRW_CONDITIONAL_L,
2045 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2046 brw_imm_f(0));
2047
2048 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2049 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2050 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2051 #endif
2052 }
2053
2054 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2055 } else if (intel->gen < 6) {
2056 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2057 } else {
2058 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2059 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2060 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2061 src_reg(output_reg[VERT_RESULT_PSIZ])));
2062 }
2063 }
2064 }
2065
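/**
 * Emits up to four user clip distances into @reg starting at plane @offset,
 * as DP4s of the clip vertex (or gl_Position) against each enabled user
 * clip plane.  Only used when the shader does not write gl_ClipDistance
 * itself.
 */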
2066 void
2067 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2068 {
2069 if (intel->gen < 6) {
2070 /* Clip distance slots are set aside in gen5, but they are not used. It
2071 * is not clear whether we actually need to set aside space for them,
2072 * but the performance cost is negligible.
2073 */
2074 return;
2075 }
2076
2077 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2078 *
2079 * "If a linked set of shaders forming the vertex stage contains no
2080 * static write to gl_ClipVertex or gl_ClipDistance, but the
2081 * application has requested clipping against user clip planes through
2082 * the API, then the coordinate written to gl_Position is used for
2083 * comparison against the user clip planes."
2084 *
2085 * This function is only called if the shader didn't write to
2086 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2087 * if the user wrote to it; otherwise we use gl_Position.
2088 */
2089 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2090 if (!(c->prog_data.outputs_written
2091 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2092 clip_vertex = VERT_RESULT_HPOS;
2093 }
2094
2095 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2096 ++i) {
2097 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2098 src_reg(output_reg[clip_vertex]),
2099 src_reg(this->userplane[i + offset])));
2100 }
2101 }
2102
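/**
 * Copies a shader output into its URB slot MRF, saturating front/back
 * colors when the key requests clamped vertex colors.
 */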
2103 void
2104 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2105 {
2106 assert (vert_result < VERT_RESULT_MAX);
2107 reg.type = output_reg[vert_result].type;
2108 current_annotation = output_reg_annotation[vert_result];
2109 /* Copy the register, saturating if necessary */
2110 vec4_instruction *inst = emit(MOV(reg,
2111 src_reg(output_reg[vert_result])));
2112 if ((vert_result == VERT_RESULT_COL0 ||
2113 vert_result == VERT_RESULT_COL1 ||
2114 vert_result == VERT_RESULT_BFC0 ||
2115 vert_result == VERT_RESULT_BFC1) &&
2116 c->key.clamp_vertex_color) {
2117 inst->saturate = true;
2118 }
2119 }
2120
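/**
 * Fills one VUE slot: the special header slots (point size and flags, NDC,
 * the duplicated gl_Position, user clip distances, padding) are handled
 * explicitly; everything else goes through emit_generic_urb_slot().
 */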
2121 void
2122 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2123 {
2124 struct brw_reg hw_reg = brw_message_reg(mrf);
2125 dst_reg reg = dst_reg(MRF, mrf);
2126 reg.type = BRW_REGISTER_TYPE_F;
2127
2128 switch (vert_result) {
2129 case VERT_RESULT_PSIZ:
2130 /* PSIZ is always in slot 0, and is coupled with other flags. */
2131 current_annotation = "indices, point width, clip flags";
2132 emit_psiz_and_flags(hw_reg);
2133 break;
2134 case BRW_VERT_RESULT_NDC:
2135 current_annotation = "NDC";
2136 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2137 break;
2138 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2139 case VERT_RESULT_HPOS:
2140 current_annotation = "gl_Position";
2141 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2142 break;
2143 case VERT_RESULT_CLIP_DIST0:
2144 case VERT_RESULT_CLIP_DIST1:
2145 if (this->c->key.uses_clip_distance) {
2146 emit_generic_urb_slot(reg, vert_result);
2147 } else {
2148 current_annotation = "user clip distances";
2149 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2150 }
2151 break;
2152 case BRW_VERT_RESULT_PAD:
2153 /* No need to write to this slot */
2154 break;
2155 default:
2156 emit_generic_urb_slot(reg, vert_result);
2157 break;
2158 }
2159 }
2160
2161 static int
2162 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2163 {
2164 struct intel_context *intel = &brw->intel;
2165
2166 if (intel->gen >= 6) {
2167 /* URB data written (does not include the message header reg) must
2168 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2169 * section 5.4.3.2.2: URB_INTERLEAVED.
2170 *
2171 * URB entries are allocated on a multiple of 1024 bits, so an
2172 * extra 128 bits written here to make the end align to 256 is
2173 * no problem.
2174 */
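      /* mlen here includes the single message header register, so the data
       * payload (mlen - 1) is even exactly when mlen is odd; round even
       * lengths up to the next odd value.
       */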
2175 if ((mlen % 2) != 1)
2176 mlen++;
2177 }
2178
2179 return mlen;
2180 }
2181
2182 /**
2183 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2184 * complete the VS thread.
2185 *
2186 * The VUE layout is documented in Volume 2a.
2187 */
2188 void
2189 vec4_visitor::emit_urb_writes()
2190 {
2191 /* MRF 0 is reserved for the debugger, so start with message header
2192 * in MRF 1.
2193 */
2194 int base_mrf = 1;
2195 int mrf = base_mrf;
2196 /* In the process of generating our URB write message contents, we
2197 * may need to unspill a register or load from an array. Those
2198 * reads would use MRFs 14-15.
2199 */
2200 int max_usable_mrf = 13;
2201
2202 /* The following assertion verifies that max_usable_mrf causes an
2203 * even-numbered amount of URB write data, which will meet gen6's
2204 * requirements for length alignment.
2205 */
2206 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2207
2208 /* FINISHME: edgeflag */
2209
2210 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2211 c->prog_data.outputs_written);
2212
2213 /* First mrf is the g0-based message header containing URB handles and such,
2214 * which is implied in VS_OPCODE_URB_WRITE.
2215 */
2216 mrf++;
2217
2218 if (intel->gen < 6) {
2219 emit_ndc_computation();
2220 }
2221
2222 /* Set up the VUE data for the first URB write */
2223 int slot;
2224 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2225 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2226
2227 /* If this was max_usable_mrf, we can't fit anything more into this URB
2228 * WRITE.
2229 */
2230 if (mrf > max_usable_mrf) {
2231 slot++;
2232 break;
2233 }
2234 }
2235
2236 current_annotation = "URB write";
2237 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2238 inst->base_mrf = base_mrf;
2239 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2240 inst->eot = (slot >= c->vue_map.num_slots);
2241
2242 /* Optional second URB write */
2243 if (!inst->eot) {
2244 mrf = base_mrf + 1;
2245
2246 for (; slot < c->vue_map.num_slots; ++slot) {
2247 assert(mrf < max_usable_mrf);
2248
2249 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2250 }
2251
2252 current_annotation = "URB write";
2253 inst = emit(VS_OPCODE_URB_WRITE);
2254 inst->base_mrf = base_mrf;
2255 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2256 inst->eot = true;
2257 /* URB destination offset. In the previous write, we got MRFs
2258 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2259 * URB row increments, and each of our MRFs is half of one of
2260 * those, since we're doing interleaved writes.
2261 */
2262 inst->offset = (max_usable_mrf - base_mrf) / 2;
2263 }
2264
2265 if (intel->gen == 6)
2266 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2267 else
2268 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2269 }
2270
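/**
 * Builds the message offset operand for a scratch read/write of vec4 slot
 * @reg_offset (plus *@reladdr, if present).
 *
 * Scratch data is stored interleaved like the vertex data, so the vec4
 * index is scaled by 2; pre-gen6 message headers take byte offsets, adding
 * another factor of 16.  For example, with no reladdr on gen5, slot 3
 * becomes the immediate 3 * 2 * 16 = 96.
 */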
2271 src_reg
2272 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2273 src_reg *reladdr, int reg_offset)
2274 {
2275 /* Because we store the values to scratch interleaved like our
2276 * vertex data, we need to scale the vec4 index by 2.
2277 */
2278 int message_header_scale = 2;
2279
2280 /* Pre-gen6, the message header uses byte offsets instead of vec4
2281 * (16-byte) offset units.
2282 */
2283 if (intel->gen < 6)
2284 message_header_scale *= 16;
2285
2286 if (reladdr) {
2287 src_reg index = src_reg(this, glsl_type::int_type);
2288
2289 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2290 emit_before(inst, MUL(dst_reg(index),
2291 index, src_reg(message_header_scale)));
2292
2293 return index;
2294 } else {
2295 return src_reg(reg_offset * message_header_scale);
2296 }
2297 }
2298
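/**
 * Builds the offset operand for a pull constant load: a vec4 index on
 * gen6+, or a byte offset (index * 16) on earlier generations, computed
 * with an ADD (and MUL, pre-gen6) inserted before @inst when a relative
 * address is involved.
 */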
2299 src_reg
2300 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2301 src_reg *reladdr, int reg_offset)
2302 {
2303 if (reladdr) {
2304 src_reg index = src_reg(this, glsl_type::int_type);
2305
2306 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2307
2308 /* Pre-gen6, the message header uses byte offsets instead of vec4
2309 * (16-byte) offset units.
2310 */
2311 if (intel->gen < 6) {
2312 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2313 }
2314
2315 return index;
2316 } else {
2317 int message_header_scale = intel->gen < 6 ? 16 : 1;
2318 return src_reg(reg_offset * message_header_scale);
2319 }
2320 }
2321
2322 /**
2323 * Emits an instruction before @inst to load the value named by @orig_src
2324 * from scratch space at @base_offset to @temp.
2325 */
2326 void
2327 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2328 dst_reg temp, src_reg orig_src,
2329 int base_offset)
2330 {
2331 int reg_offset = base_offset + orig_src.reg_offset;
2332 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2333
2334 emit_before(inst, SCRATCH_READ(temp, index));
2335 }
2336
2337 /**
2338 * Emits an instruction after @inst to store the value to be written
2339 * to @orig_dst to scratch space at @base_offset, from @temp.
2340 */
2341 void
2342 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2343 src_reg temp, dst_reg orig_dst,
2344 int base_offset)
2345 {
2346 int reg_offset = base_offset + orig_dst.reg_offset;
2347 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2348
2349 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2350 orig_dst.writemask));
2351 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2352 write->predicate = inst->predicate;
2353 write->ir = inst->ir;
2354 write->annotation = inst->annotation;
2355 inst->insert_after(write);
2356 }
2357
2358 /**
2359 * We can't generally support array access in GRF space, because a
2360 * single instruction's destination can only span 2 contiguous
2361 * registers. So, we send all GRF arrays that get variable index
2362 * access to scratch space.
2363 */
2364 void
2365 vec4_visitor::move_grf_array_access_to_scratch()
2366 {
2367 int scratch_loc[this->virtual_grf_count];
2368
2369 for (int i = 0; i < this->virtual_grf_count; i++) {
2370 scratch_loc[i] = -1;
2371 }
2372
2373 /* First, calculate the set of virtual GRFs that need to be punted
2374 * to scratch due to having any array access on them, and where in
2375 * scratch.
2376 */
2377 foreach_list(node, &this->instructions) {
2378 vec4_instruction *inst = (vec4_instruction *)node;
2379
2380 if (inst->dst.file == GRF && inst->dst.reladdr &&
2381 scratch_loc[inst->dst.reg] == -1) {
2382 scratch_loc[inst->dst.reg] = c->last_scratch;
2383 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2384 }
2385
2386 for (int i = 0 ; i < 3; i++) {
2387 src_reg *src = &inst->src[i];
2388
2389 if (src->file == GRF && src->reladdr &&
2390 scratch_loc[src->reg] == -1) {
2391 scratch_loc[src->reg] = c->last_scratch;
2392 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2393 }
2394 }
2395 }
2396
2397 /* Now, for anything that will be accessed through scratch, rewrite
2398 * it to load/store. Note that this is a _safe list walk, because
2399 * we may generate a new scratch_write instruction after the one
2400 * we're processing.
2401 */
2402 foreach_list_safe(node, &this->instructions) {
2403 vec4_instruction *inst = (vec4_instruction *)node;
2404
2405 /* Set up the annotation tracking for new generated instructions. */
2406 base_ir = inst->ir;
2407 current_annotation = inst->annotation;
2408
2409 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2410 src_reg temp = src_reg(this, glsl_type::vec4_type);
2411
2412 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2413
2414 inst->dst.file = temp.file;
2415 inst->dst.reg = temp.reg;
2416 inst->dst.reg_offset = temp.reg_offset;
2417 inst->dst.reladdr = NULL;
2418 }
2419
2420 for (int i = 0 ; i < 3; i++) {
2421 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2422 continue;
2423
2424 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2425
2426 emit_scratch_read(inst, temp, inst->src[i],
2427 scratch_loc[inst->src[i].reg]);
2428
2429 inst->src[i].file = temp.file;
2430 inst->src[i].reg = temp.reg;
2431 inst->src[i].reg_offset = temp.reg_offset;
2432 inst->src[i].reladdr = NULL;
2433 }
2434 }
2435 }
2436
2437 /**
2438 * Emits an instruction before @inst to load the value named by @orig_src
2439 * from the pull constant buffer (surface) at @base_offset to @temp.
2440 */
2441 void
2442 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2443 dst_reg temp, src_reg orig_src,
2444 int base_offset)
2445 {
2446 int reg_offset = base_offset + orig_src.reg_offset;
2447 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2448 vec4_instruction *load;
2449
2450 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2451 temp, index);
2452 load->base_mrf = 14;
2453 load->mlen = 1;
2454 emit_before(inst, load);
2455 }
2456
2457 /**
2458 * Implements array access of uniforms by inserting a
2459 * PULL_CONSTANT_LOAD instruction.
2460 *
2461 * Unlike temporary GRF array access (where we don't support it due to
2462 * the difficulty of doing relative addressing on instruction
2463 * destinations), we could potentially do array access of uniforms
2464 * that were loaded in GRF space as push constants. In real-world
2465 * usage we've seen, though, the arrays being used are always larger
2466 * than we could load as push constants, so just always move all
2467 * uniform array access out to a pull constant buffer.
2468 */
2469 void
2470 vec4_visitor::move_uniform_array_access_to_pull_constants()
2471 {
2472 int pull_constant_loc[this->uniforms];
2473
2474 for (int i = 0; i < this->uniforms; i++) {
2475 pull_constant_loc[i] = -1;
2476 }
2477
2478 /* Walk through and find array access of uniforms. Put a copy of that
2479 * uniform in the pull constant buffer.
2480 *
2481 * Note that we don't move constant-indexed accesses to arrays. No
2482 * testing has been done of the performance impact of this choice.
2483 */
2484 foreach_list_safe(node, &this->instructions) {
2485 vec4_instruction *inst = (vec4_instruction *)node;
2486
2487 for (int i = 0 ; i < 3; i++) {
2488 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2489 continue;
2490
2491 int uniform = inst->src[i].reg;
2492
2493 /* If this array isn't already present in the pull constant buffer,
2494 * add it.
2495 */
2496 if (pull_constant_loc[uniform] == -1) {
2497 const float **values = &prog_data->param[uniform * 4];
2498
2499 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2500
2501 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2502 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2503 }
2504 }
2505
2506 /* Set up the annotation tracking for new generated instructions. */
2507 base_ir = inst->ir;
2508 current_annotation = inst->annotation;
2509
2510 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2511
2512 emit_pull_constant_load(inst, temp, inst->src[i],
2513 pull_constant_loc[uniform]);
2514
2515 inst->src[i].file = temp.file;
2516 inst->src[i].reg = temp.reg;
2517 inst->src[i].reg_offset = temp.reg_offset;
2518 inst->src[i].reladdr = NULL;
2519 }
2520 }
2521
2522 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2523 * no need to track them as larger-than-vec4 objects. This will be
2524 * relied on in cutting out unused uniform vectors from push
2525 * constants.
2526 */
2527 split_uniform_registers();
2528 }
2529
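/**
 * Resolves a negate modifier on an unsigned (UD) source by applying it once
 * through a MOV into a fresh uvec4 temporary and rewriting *reg to point at
 * that temporary, since leaving the negate on a UD operand does not behave
 * as intended for the instructions that consume it.
 */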
2530 void
2531 vec4_visitor::resolve_ud_negate(src_reg *reg)
2532 {
2533 if (reg->type != BRW_REGISTER_TYPE_UD ||
2534 !reg->negate)
2535 return;
2536
2537 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2538 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2539 *reg = temp;
2540 }
2541
2542 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2543 struct gl_shader_program *prog,
2544 struct brw_shader *shader)
2545 {
2546 this->c = c;
2547 this->p = &c->func;
2548 this->brw = p->brw;
2549 this->intel = &brw->intel;
2550 this->ctx = &intel->ctx;
2551 this->prog = prog;
2552 this->shader = shader;
2553
2554 this->mem_ctx = ralloc_context(NULL);
2555 this->failed = false;
2556
2557 this->base_ir = NULL;
2558 this->current_annotation = NULL;
2559
2561 this->vp = (struct gl_vertex_program *)
2562 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2563 this->prog_data = &c->prog_data;
2564
2565 this->variable_ht = hash_table_ctor(0,
2566 hash_table_pointer_hash,
2567 hash_table_pointer_compare);
2568
2569 this->virtual_grf_def = NULL;
2570 this->virtual_grf_use = NULL;
2571 this->virtual_grf_sizes = NULL;
2572 this->virtual_grf_count = 0;
2573 this->virtual_grf_reg_map = NULL;
2574 this->virtual_grf_reg_count = 0;
2575 this->virtual_grf_array_size = 0;
2576 this->live_intervals_valid = false;
2577
2578 this->uniforms = 0;
2583 }
2584
2585 vec4_visitor::~vec4_visitor()
2586 {
2587 ralloc_free(this->mem_ctx);
2588 hash_table_dtor(this->variable_ht);
2589 }
2590
2591
2592 void
2593 vec4_visitor::fail(const char *format, ...)
2594 {
2595 va_list va;
2596 char *msg;
2597
2598 if (failed)
2599 return;
2600
2601 failed = true;
2602
2603 va_start(va, format);
2604 msg = ralloc_vasprintf(mem_ctx, format, va);
2605 va_end(va);
2606 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2607
2608 this->fail_msg = msg;
2609
2610 if (INTEL_DEBUG & DEBUG_VS) {
2611 fprintf(stderr, "%s", msg);
2612 }
2613 }
2614
2615 } /* namespace brw */