src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 #include "program/sampler.h"
  29 }
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(vec4_visitor *v,
  34                                    enum opcode opcode, dst_reg dst,
  35                                    src_reg src0, src_reg src1, src_reg src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->ir = v->base_ir;
  43    this->annotation = v->current_annotation;
  44 }
  45
  46 vec4_instruction *
  47 vec4_visitor::emit(vec4_instruction *inst)
  48 {
  49    this->instructions.push_tail(inst);
  50
  51    return inst;
  52 }
  53
  54 vec4_instruction *
  55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  56 {
  57    new_inst->ir = inst->ir;
  58    new_inst->annotation = inst->annotation;
  59
  60    inst->insert_before(new_inst);
  61
  62    return inst;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  67                    src_reg src0, src_reg src1, src_reg src2)
  68 {
  69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  70                                              src0, src1, src2));
  71 }
  72
  73
  74 vec4_instruction *
  75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  76 {
  77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  78 }
  79
  80 vec4_instruction *
  81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  90 }
  91
  92 #define ALU1(op)                                                        \
  93    vec4_instruction *                                                   \
  94    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  95    {                                                                    \
  96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  97                                            src0);                       \
  98    }
  99
 100 #define ALU2(op)                                                        \
 101    vec4_instruction *                                                   \
 102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 103    {                                                                    \
 104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 105                                            src0, src1);                 \
 106    }
 107
 108 ALU1(NOT)
 109 ALU1(MOV)
 110 ALU1(FRC)
 111 ALU1(RNDD)
 112 ALU1(RNDE)
 113 ALU1(RNDZ)
 114 ALU2(ADD)
 115 ALU2(MUL)
 116 ALU2(MACH)
 117 ALU2(AND)
 118 ALU2(OR)
 119 ALU2(XOR)
 120 ALU2(DP3)
 121 ALU2(DP4)
 122 ALU2(DPH)
 123 ALU2(SHL)
 124 ALU2(SHR)
 125 ALU2(ASR)
 126
 127 /** Gen4 predicated IF. */
 128 vec4_instruction *
 129 vec4_visitor::IF(uint32_t predicate)
 130 {
 131    vec4_instruction *inst;
 132
 133    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 134    inst->predicate = predicate;
 135
 136    return inst;
 137 }
 138
 139 /** Gen6+ IF with embedded comparison. */
 140 vec4_instruction *
 141 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 142 {
 143    assert(intel->gen >= 6);
 144
 145    vec4_instruction *inst;
 146
 147    resolve_ud_negate(&src0);
 148    resolve_ud_negate(&src1);
 149
 150    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 151                                         src0, src1);
 152    inst->conditional_mod = condition;
 153
 154    return inst;
 155 }
 156
 157 /**
 158  * CMP: Sets the low bit of the destination channels with the result
 159  * of the comparison, while the upper bits are undefined, and updates
 160  * the flag register with the packed 16 bits of the result.
 161  */
 162 vec4_instruction *
 163 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 164 {
 165    vec4_instruction *inst;
 166
 167    /* original gen4 does type conversion to the destination type
 168     * before before comparison, producing garbage results for floating
 169     * point comparisons.
 170     */
 171    if (intel->gen == 4) {
 172       dst.type = src0.type;
 173       if (dst.file == HW_REG)
 174          dst.fixed_hw_reg.type = dst.type;
 175    }
 176
 177    resolve_ud_negate(&src0);
 178    resolve_ud_negate(&src1);
 179
 180    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 181    inst->conditional_mod = condition;
 182
 183    return inst;
 184 }
 185
 186 vec4_instruction *
 187 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 188 {
 189    vec4_instruction *inst;
 190
 191    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 192                                         dst, index);
 193    inst->base_mrf = 14;
 194    inst->mlen = 2;
 195
 196    return inst;
 197 }
 198
 199 vec4_instruction *
 200 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 201 {
 202    vec4_instruction *inst;
 203
 204    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 205                                         dst, src, index);
 206    inst->base_mrf = 13;
 207    inst->mlen = 3;
 208
 209    return inst;
 210 }
 211
 212 void
 213 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 214 {
 215    static enum opcode dot_opcodes[] = {
 216       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 217    };
 218
 219    emit(dot_opcodes[elements - 2], dst, src0, src1);
 220 }
 221
 222 void
 223 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 224 {
 225    /* The gen6 math instruction ignores the source modifiers --
 226     * swizzle, abs, negate, and at least some parts of the register
 227     * region description.
 228     *
 229     * While it would seem that this MOV could be avoided at this point
 230     * in the case that the swizzle is matched up with the destination
 231     * writemask, note that uniform packing and register allocation
 232     * could rearrange our swizzle, so let's leave this matter up to
 233     * copy propagation later.
 234     */
 235    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 236    emit(MOV(dst_reg(temp_src), src));
 237
 238    if (dst.writemask != WRITEMASK_XYZW) {
 239       /* The gen6 math instruction must be align1, so we can't do
 240        * writemasks.
 241        */
 242       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 243
 244       emit(opcode, temp_dst, temp_src);
 245
 246       emit(MOV(dst, src_reg(temp_dst)));
 247    } else {
 248       emit(opcode, dst, temp_src);
 249    }
 250 }
 251
 252 void
 253 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 254 {
 255    vec4_instruction *inst = emit(opcode, dst, src);
 256    inst->base_mrf = 1;
 257    inst->mlen = 1;
 258 }
 259
 260 void
 261 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 262 {
 263    switch (opcode) {
 264    case SHADER_OPCODE_RCP:
 265    case SHADER_OPCODE_RSQ:
 266    case SHADER_OPCODE_SQRT:
 267    case SHADER_OPCODE_EXP2:
 268    case SHADER_OPCODE_LOG2:
 269    case SHADER_OPCODE_SIN:
 270    case SHADER_OPCODE_COS:
 271       break;
 272    default:
 273       assert(!"not reached: bad math opcode");
 274       return;
 275    }
 276
 277    if (intel->gen >= 7) {
 278       emit(opcode, dst, src);
 279    } else if (intel->gen == 6) {
 280       return emit_math1_gen6(opcode, dst, src);
 281    } else {
 282       return emit_math1_gen4(opcode, dst, src);
 283    }
 284 }
 285
 286 void
 287 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 288                               dst_reg dst, src_reg src0, src_reg src1)
 289 {
 290    src_reg expanded;
 291
 292    /* The gen6 math instruction ignores the source modifiers --
 293     * swizzle, abs, negate, and at least some parts of the register
 294     * region description.  Move the sources to temporaries to make it
 295     * generally work.
 296     */
 297
 298    expanded = src_reg(this, glsl_type::vec4_type);
 299    expanded.type = src0.type;
 300    emit(MOV(dst_reg(expanded), src0));
 301    src0 = expanded;
 302
 303    expanded = src_reg(this, glsl_type::vec4_type);
 304    expanded.type = src1.type;
 305    emit(MOV(dst_reg(expanded), src1));
 306    src1 = expanded;
 307
 308    if (dst.writemask != WRITEMASK_XYZW) {
 309       /* The gen6 math instruction must be align1, so we can't do
 310        * writemasks.
 311        */
 312       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 313       temp_dst.type = dst.type;
 314
 315       emit(opcode, temp_dst, src0, src1);
 316
 317       emit(MOV(dst, src_reg(temp_dst)));
 318    } else {
 319       emit(opcode, dst, src0, src1);
 320    }
 321 }
 322
 323 void
 324 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 325                               dst_reg dst, src_reg src0, src_reg src1)
 326 {
 327    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 328    inst->base_mrf = 1;
 329    inst->mlen = 2;
 330 }
 331
 332 void
 333 vec4_visitor::emit_math(enum opcode opcode,
 334                         dst_reg dst, src_reg src0, src_reg src1)
 335 {
 336    switch (opcode) {
 337    case SHADER_OPCODE_POW:
 338    case SHADER_OPCODE_INT_QUOTIENT:
 339    case SHADER_OPCODE_INT_REMAINDER:
 340       break;
 341    default:
 342       assert(!"not reached: unsupported binary math opcode");
 343       return;
 344    }
 345
 346    if (intel->gen >= 7) {
 347       emit(opcode, dst, src0, src1);
 348    } else if (intel->gen == 6) {
 349       return emit_math2_gen6(opcode, dst, src0, src1);
 350    } else {
 351       return emit_math2_gen4(opcode, dst, src0, src1);
 352    }
 353 }
 354
 355 void
 356 vec4_visitor::visit_instructions(const exec_list *list)
 357 {
 358    foreach_list(node, list) {
 359       ir_instruction *ir = (ir_instruction *)node;
 360
 361       base_ir = ir;
 362       ir->accept(this);
 363    }
 364 }
 365
 366
 367 static int
 368 type_size(const struct glsl_type *type)
 369 {
 370    unsigned int i;
 371    int size;
 372
 373    switch (type->base_type) {
 374    case GLSL_TYPE_UINT:
 375    case GLSL_TYPE_INT:
 376    case GLSL_TYPE_FLOAT:
 377    case GLSL_TYPE_BOOL:
 378       if (type->is_matrix()) {
 379          return type->matrix_columns;
 380       } else {
 381          /* Regardless of size of vector, it gets a vec4. This is bad
 382           * packing for things like floats, but otherwise arrays become a
 383           * mess.  Hopefully a later pass over the code can pack scalars
 384           * down if appropriate.
 385           */
 386          return 1;
 387       }
 388    case GLSL_TYPE_ARRAY:
 389       assert(type->length > 0);
 390       return type_size(type->fields.array) * type->length;
 391    case GLSL_TYPE_STRUCT:
 392       size = 0;
 393       for (i = 0; i < type->length; i++) {
 394          size += type_size(type->fields.structure[i].type);
 395       }
 396       return size;
 397    case GLSL_TYPE_SAMPLER:
 398       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 399        * at link time.
 400        */
 401       return 1;
 402    default:
 403       assert(0);
 404       return 0;
 405    }
 406 }
 407
 408 int
 409 vec4_visitor::virtual_grf_alloc(int size)
 410 {
 411    if (virtual_grf_array_size <= virtual_grf_count) {
 412       if (virtual_grf_array_size == 0)
 413          virtual_grf_array_size = 16;
 414       else
 415          virtual_grf_array_size *= 2;
 416       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 417                                    virtual_grf_array_size);
 418       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 419                                      virtual_grf_array_size);
 420    }
 421    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 422    virtual_grf_reg_count += size;
 423    virtual_grf_sizes[virtual_grf_count] = size;
 424    return virtual_grf_count++;
 425 }
 426
 427 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 428 {
 429    init();
 430
 431    this->file = GRF;
 432    this->reg = v->virtual_grf_alloc(type_size(type));
 433
 434    if (type->is_array() || type->is_record()) {
 435       this->swizzle = BRW_SWIZZLE_NOOP;
 436    } else {
 437       this->swizzle = swizzle_for_size(type->vector_elements);
 438    }
 439
 440    this->type = brw_type_for_base_type(type);
 441 }
 442
 443 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 444 {
 445    init();
 446
 447    this->file = GRF;
 448    this->reg = v->virtual_grf_alloc(type_size(type));
 449
 450    if (type->is_array() || type->is_record()) {
 451       this->writemask = WRITEMASK_XYZW;
 452    } else {
 453       this->writemask = (1 << type->vector_elements) - 1;
 454    }
 455
 456    this->type = brw_type_for_base_type(type);
 457 }
 458
 459 /* Our support for uniforms is piggy-backed on the struct
 460  * gl_fragment_program, because that's where the values actually
 461  * get stored, rather than in some global gl_shader_program uniform
 462  * store.
 463  */
 464 int
 465 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 466 {
 467    unsigned int offset = 0;
 468    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 469
 470    if (type->is_matrix()) {
 471       const glsl_type *column = type->column_type();
 472
 473       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 474          offset += setup_uniform_values(loc + offset, column);
 475       }
 476
 477       return offset;
 478    }
 479
 480    switch (type->base_type) {
 481    case GLSL_TYPE_FLOAT:
 482    case GLSL_TYPE_UINT:
 483    case GLSL_TYPE_INT:
 484    case GLSL_TYPE_BOOL:
 485       for (unsigned int i = 0; i < type->vector_elements; i++) {
 486          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 487       }
 488
 489       /* Set up pad elements to get things aligned to a vec4 boundary. */
 490       for (unsigned int i = type->vector_elements; i < 4; i++) {
 491          static float zero = 0;
 492
 493          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 494       }
 495
 496       /* Track the size of this uniform vector, for future packing of
 497        * uniforms.
 498        */
 499       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 500       this->uniforms++;
 501
 502       return 1;
 503
 504    case GLSL_TYPE_STRUCT:
 505       for (unsigned int i = 0; i < type->length; i++) {
 506          offset += setup_uniform_values(loc + offset,
 507                                         type->fields.structure[i].type);
 508       }
 509       return offset;
 510
 511    case GLSL_TYPE_ARRAY:
 512       for (unsigned int i = 0; i < type->length; i++) {
 513          offset += setup_uniform_values(loc + offset, type->fields.array);
 514       }
 515       return offset;
 516
 517    case GLSL_TYPE_SAMPLER:
 518       /* The sampler takes up a slot, but we don't use any values from it. */
 519       return 1;
 520
 521    default:
 522       assert(!"not reached");
 523       return 0;
 524    }
 525 }
 526
 527 void
 528 vec4_visitor::setup_uniform_clipplane_values()
 529 {
 530    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 531
 532    if (intel->gen < 6) {
 533       /* Pre-Gen6, we compact clip planes.  For example, if the user
 534        * enables just clip planes 0, 1, and 3, we will enable clip planes
 535        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 536        * plane 2.  This simplifies the implementation of the Gen6 clip
 537        * thread.
 538        */
 539       int compacted_clipplane_index = 0;
 540       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
 541          if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
 542             continue;
 543
 544          this->uniform_vector_size[this->uniforms] = 4;
 545          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 546          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 547          for (int j = 0; j < 4; ++j) {
 548             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 549          }
 550          ++compacted_clipplane_index;
 551          ++this->uniforms;
 552       }
 553    } else {
 554       /* In Gen6 and later, we don't compact clip planes, because this
 555        * simplifies the implementation of gl_ClipDistance.
 556        */
 557       for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 558          this->uniform_vector_size[this->uniforms] = 4;
 559          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 560          this->userplane[i].type = BRW_REGISTER_TYPE_F;
 561          for (int j = 0; j < 4; ++j) {
 562             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 563          }
 564          ++this->uniforms;
 565       }
 566    }
 567 }
 568
 569 /* Our support for builtin uniforms is even scarier than non-builtin.
 570  * It sits on top of the PROG_STATE_VAR parameters that are
 571  * automatically updated from GL context state.
 572  */
 573 void
 574 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 575 {
 576    const ir_state_slot *const slots = ir->state_slots;
 577    assert(ir->state_slots != NULL);
 578
 579    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 580       /* This state reference has already been setup by ir_to_mesa,
 581        * but we'll get the same index back here.  We can reference
 582        * ParameterValues directly, since unlike brw_fs.cpp, we never
 583        * add new state references during compile.
 584        */
 585       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 586                                             (gl_state_index *)slots[i].tokens);
 587       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 588
 589       this->uniform_vector_size[this->uniforms] = 0;
 590       /* Add each of the unique swizzled channels of the element.
 591        * This will end up matching the size of the glsl_type of this field.
 592        */
 593       int last_swiz = -1;
 594       for (unsigned int j = 0; j < 4; j++) {
 595          int swiz = GET_SWZ(slots[i].swizzle, j);
 596          last_swiz = swiz;
 597
 598          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 599          if (swiz <= last_swiz)
 600             this->uniform_vector_size[this->uniforms]++;
 601       }
 602       this->uniforms++;
 603    }
 604 }
 605
 606 dst_reg *
 607 vec4_visitor::variable_storage(ir_variable *var)
 608 {
 609    return (dst_reg *)hash_table_find(this->variable_ht, var);
 610 }
 611
 612 void
 613 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 614 {
 615    ir_expression *expr = ir->as_expression();
 616
 617    *predicate = BRW_PREDICATE_NORMAL;
 618
 619    if (expr) {
 620       src_reg op[2];
 621       vec4_instruction *inst;
 622
 623       assert(expr->get_num_operands() <= 2);
 624       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 625          expr->operands[i]->accept(this);
 626          op[i] = this->result;
 627
 628          resolve_ud_negate(&op[i]);
 629       }
 630
 631       switch (expr->operation) {
 632       case ir_unop_logic_not:
 633          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 634          inst->conditional_mod = BRW_CONDITIONAL_Z;
 635          break;
 636
 637       case ir_binop_logic_xor:
 638          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 639          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 640          break;
 641
 642       case ir_binop_logic_or:
 643          inst = emit(OR(dst_null_d(), op[0], op[1]));
 644          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 645          break;
 646
 647       case ir_binop_logic_and:
 648          inst = emit(AND(dst_null_d(), op[0], op[1]));
 649          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 650          break;
 651
 652       case ir_unop_f2b:
 653          if (intel->gen >= 6) {
 654             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 655          } else {
 656             inst = emit(MOV(dst_null_f(), op[0]));
 657             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 658          }
 659          break;
 660
 661       case ir_unop_i2b:
 662          if (intel->gen >= 6) {
 663             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 664          } else {
 665             inst = emit(MOV(dst_null_d(), op[0]));
 666             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 667          }
 668          break;
 669
 670       case ir_binop_all_equal:
 671          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 672          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 673          break;
 674
 675       case ir_binop_any_nequal:
 676          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 677          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 678          break;
 679
 680       case ir_unop_any:
 681          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 682          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 683          break;
 684
 685       case ir_binop_greater:
 686       case ir_binop_gequal:
 687       case ir_binop_less:
 688       case ir_binop_lequal:
 689       case ir_binop_equal:
 690       case ir_binop_nequal:
 691          emit(CMP(dst_null_d(), op[0], op[1],
 692                   brw_conditional_for_comparison(expr->operation)));
 693          break;
 694
 695       default:
 696          assert(!"not reached");
 697          break;
 698       }
 699       return;
 700    }
 701
 702    ir->accept(this);
 703
 704    resolve_ud_negate(&this->result);
 705
 706    if (intel->gen >= 6) {
 707       vec4_instruction *inst = emit(AND(dst_null_d(),
 708                                         this->result, src_reg(1)));
 709       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 710    } else {
 711       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 712       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 713    }
 714 }
 715
 716 /**
 717  * Emit a gen6 IF statement with the comparison folded into the IF
 718  * instruction.
 719  */
 720 void
 721 vec4_visitor::emit_if_gen6(ir_if *ir)
 722 {
 723    ir_expression *expr = ir->condition->as_expression();
 724
 725    if (expr) {
 726       src_reg op[2];
 727       dst_reg temp;
 728
 729       assert(expr->get_num_operands() <= 2);
 730       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 731          expr->operands[i]->accept(this);
 732          op[i] = this->result;
 733       }
 734
 735       switch (expr->operation) {
 736       case ir_unop_logic_not:
 737          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 738          return;
 739
 740       case ir_binop_logic_xor:
 741          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 742          return;
 743
 744       case ir_binop_logic_or:
 745          temp = dst_reg(this, glsl_type::bool_type);
 746          emit(OR(temp, op[0], op[1]));
 747          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 748          return;
 749
 750       case ir_binop_logic_and:
 751          temp = dst_reg(this, glsl_type::bool_type);
 752          emit(AND(temp, op[0], op[1]));
 753          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 754          return;
 755
 756       case ir_unop_f2b:
 757          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 758          return;
 759
 760       case ir_unop_i2b:
 761          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 762          return;
 763
 764       case ir_binop_greater:
 765       case ir_binop_gequal:
 766       case ir_binop_less:
 767       case ir_binop_lequal:
 768       case ir_binop_equal:
 769       case ir_binop_nequal:
 770          emit(IF(op[0], op[1],
 771                  brw_conditional_for_comparison(expr->operation)));
 772          return;
 773
 774       case ir_binop_all_equal:
 775          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 776          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 777          return;
 778
 779       case ir_binop_any_nequal:
 780          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 781          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 782          return;
 783
 784       case ir_unop_any:
 785          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 786          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 787          return;
 788
 789       default:
 790          assert(!"not reached");
 791          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 792          return;
 793       }
 794       return;
 795    }
 796
 797    ir->condition->accept(this);
 798
 799    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 800 }
 801
 802 void
 803 vec4_visitor::visit(ir_variable *ir)
 804 {
 805    dst_reg *reg = NULL;
 806
 807    if (variable_storage(ir))
 808       return;
 809
 810    switch (ir->mode) {
 811    case ir_var_in:
 812       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 813
 814       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 815        * come in as floating point conversions of the integer values.
 816        */
 817       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 818          if (!c->key.gl_fixed_input_size[i])
 819             continue;
 820
 821          dst_reg dst = *reg;
 822          dst.type = brw_type_for_base_type(ir->type);
 823          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 824          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 825       }
 826       break;
 827
 828    case ir_var_out:
 829       reg = new(mem_ctx) dst_reg(this, ir->type);
 830
 831       for (int i = 0; i < type_size(ir->type); i++) {
 832          output_reg[ir->location + i] = *reg;
 833          output_reg[ir->location + i].reg_offset = i;
 834          output_reg[ir->location + i].type =
 835             brw_type_for_base_type(ir->type->get_scalar_type());
 836          output_reg_annotation[ir->location + i] = ir->name;
 837       }
 838       break;
 839
 840    case ir_var_auto:
 841    case ir_var_temporary:
 842       reg = new(mem_ctx) dst_reg(this, ir->type);
 843       break;
 844
 845    case ir_var_uniform:
 846       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 847
 848       /* Thanks to the lower_ubo_reference pass, we will see only
 849        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 850        * variables, so no need for them to be in variable_ht.
 851        */
 852       if (ir->uniform_block != -1)
 853          return;
 854
 855       /* Track how big the whole uniform variable is, in case we need to put a
 856        * copy of its data into pull constants for array access.
 857        */
 858       this->uniform_size[this->uniforms] = type_size(ir->type);
 859
 860       if (!strncmp(ir->name, "gl_", 3)) {
 861          setup_builtin_uniform_values(ir);
 862       } else {
 863          setup_uniform_values(ir->location, ir->type);
 864       }
 865       break;
 866
 867    case ir_var_system_value:
 868       /* VertexID is stored by the VF as the last vertex element, but
 869        * we don't represent it with a flag in inputs_read, so we call
 870        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 871        */
 872       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 873       prog_data->uses_vertexid = true;
 874
 875       switch (ir->location) {
 876       case SYSTEM_VALUE_VERTEX_ID:
 877          reg->writemask = WRITEMASK_X;
 878          break;
 879       case SYSTEM_VALUE_INSTANCE_ID:
 880          reg->writemask = WRITEMASK_Y;
 881          break;
 882       default:
 883          assert(!"not reached");
 884          break;
 885       }
 886       break;
 887
 888    default:
 889       assert(!"not reached");
 890    }
 891
 892    reg->type = brw_type_for_base_type(ir->type);
 893    hash_table_insert(this->variable_ht, reg, ir);
 894 }
 895
 896 void
 897 vec4_visitor::visit(ir_loop *ir)
 898 {
 899    dst_reg counter;
 900
 901    /* We don't want debugging output to print the whole body of the
 902     * loop as the annotation.
 903     */
 904    this->base_ir = NULL;
 905
 906    if (ir->counter != NULL) {
 907       this->base_ir = ir->counter;
 908       ir->counter->accept(this);
 909       counter = *(variable_storage(ir->counter));
 910
 911       if (ir->from != NULL) {
 912          this->base_ir = ir->from;
 913          ir->from->accept(this);
 914
 915          emit(MOV(counter, this->result));
 916       }
 917    }
 918
 919    emit(BRW_OPCODE_DO);
 920
 921    if (ir->to) {
 922       this->base_ir = ir->to;
 923       ir->to->accept(this);
 924
 925       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 926                brw_conditional_for_comparison(ir->cmp)));
 927
 928       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 929       inst->predicate = BRW_PREDICATE_NORMAL;
 930    }
 931
 932    visit_instructions(&ir->body_instructions);
 933
 934
 935    if (ir->increment) {
 936       this->base_ir = ir->increment;
 937       ir->increment->accept(this);
 938       emit(ADD(counter, src_reg(counter), this->result));
 939    }
 940
 941    emit(BRW_OPCODE_WHILE);
 942 }
 943
 944 void
 945 vec4_visitor::visit(ir_loop_jump *ir)
 946 {
 947    switch (ir->mode) {
 948    case ir_loop_jump::jump_break:
 949       emit(BRW_OPCODE_BREAK);
 950       break;
 951    case ir_loop_jump::jump_continue:
 952       emit(BRW_OPCODE_CONTINUE);
 953       break;
 954    }
 955 }
 956
 957
 958 void
 959 vec4_visitor::visit(ir_function_signature *ir)
 960 {
 961    assert(0);
 962    (void)ir;
 963 }
 964
 965 void
 966 vec4_visitor::visit(ir_function *ir)
 967 {
 968    /* Ignore function bodies other than main() -- we shouldn't see calls to
 969     * them since they should all be inlined.
 970     */
 971    if (strcmp(ir->name, "main") == 0) {
 972       const ir_function_signature *sig;
 973       exec_list empty;
 974
 975       sig = ir->matching_signature(&empty);
 976
 977       assert(sig);
 978
 979       visit_instructions(&sig->body);
 980    }
 981 }
 982
 983 bool
 984 vec4_visitor::try_emit_sat(ir_expression *ir)
 985 {
 986    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 987    if (!sat_src)
 988       return false;
 989
 990    sat_src->accept(this);
 991    src_reg src = this->result;
 992
 993    this->result = src_reg(this, ir->type);
 994    vec4_instruction *inst;
 995    inst = emit(MOV(dst_reg(this->result), src));
 996    inst->saturate = true;
 997
 998    return true;
 999 }
1000
1001 void
1002 vec4_visitor::emit_bool_comparison(unsigned int op,
1003                                  dst_reg dst, src_reg src0, src_reg src1)
1004 {
1005    /* original gen4 does destination conversion before comparison. */
1006    if (intel->gen < 5)
1007       dst.type = src0.type;
1008
1009    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1010
1011    dst.type = BRW_REGISTER_TYPE_D;
1012    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1013 }
1014
1015 void
1016 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1017                           src_reg src0, src_reg src1)
1018 {
1019    vec4_instruction *inst;
1020
1021    if (intel->gen >= 6) {
1022       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1023       inst->conditional_mod = conditionalmod;
1024    } else {
1025       emit(CMP(dst, src0, src1, conditionalmod));
1026
1027       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1028       inst->predicate = BRW_PREDICATE_NORMAL;
1029    }
1030 }
1031
1032 void
1033 vec4_visitor::visit(ir_expression *ir)
1034 {
1035    unsigned int operand;
1036    src_reg op[Elements(ir->operands)];
1037    src_reg result_src;
1038    dst_reg result_dst;
1039    vec4_instruction *inst;
1040
1041    if (try_emit_sat(ir))
1042       return;
1043
1044    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1045       this->result.file = BAD_FILE;
1046       ir->operands[operand]->accept(this);
1047       if (this->result.file == BAD_FILE) {
1048          printf("Failed to get tree for expression operand:\n");
1049          ir->operands[operand]->print();
1050          exit(1);
1051       }
1052       op[operand] = this->result;
1053
1054       /* Matrix expression operands should have been broken down to vector
1055        * operations already.
1056        */
1057       assert(!ir->operands[operand]->type->is_matrix());
1058    }
1059
1060    int vector_elements = ir->operands[0]->type->vector_elements;
1061    if (ir->operands[1]) {
1062       vector_elements = MAX2(vector_elements,
1063                              ir->operands[1]->type->vector_elements);
1064    }
1065
1066    this->result.file = BAD_FILE;
1067
1068    /* Storage for our result.  Ideally for an assignment we'd be using
1069     * the actual storage for the result here, instead.
1070     */
1071    result_src = src_reg(this, ir->type);
1072    /* convenience for the emit functions below. */
1073    result_dst = dst_reg(result_src);
1074    /* If nothing special happens, this is the result. */
1075    this->result = result_src;
1076    /* Limit writes to the channels that will be used by result_src later.
1077     * This does limit this temp's use as a temporary for multi-instruction
1078     * sequences.
1079     */
1080    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1081
1082    switch (ir->operation) {
1083    case ir_unop_logic_not:
1084       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1085        * ones complement of the whole register, not just bit 0.
1086        */
1087       emit(XOR(result_dst, op[0], src_reg(1)));
1088       break;
1089    case ir_unop_neg:
1090       op[0].negate = !op[0].negate;
1091       this->result = op[0];
1092       break;
1093    case ir_unop_abs:
1094       op[0].abs = true;
1095       op[0].negate = false;
1096       this->result = op[0];
1097       break;
1098
1099    case ir_unop_sign:
1100       emit(MOV(result_dst, src_reg(0.0f)));
1101
1102       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1103       inst = emit(MOV(result_dst, src_reg(1.0f)));
1104       inst->predicate = BRW_PREDICATE_NORMAL;
1105
1106       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1107       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1108       inst->predicate = BRW_PREDICATE_NORMAL;
1109
1110       break;
1111
1112    case ir_unop_rcp:
1113       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1114       break;
1115
1116    case ir_unop_exp2:
1117       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1118       break;
1119    case ir_unop_log2:
1120       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1121       break;
1122    case ir_unop_exp:
1123    case ir_unop_log:
1124       assert(!"not reached: should be handled by ir_explog_to_explog2");
1125       break;
1126    case ir_unop_sin:
1127    case ir_unop_sin_reduced:
1128       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1129       break;
1130    case ir_unop_cos:
1131    case ir_unop_cos_reduced:
1132       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1133       break;
1134
1135    case ir_unop_dFdx:
1136    case ir_unop_dFdy:
1137       assert(!"derivatives not valid in vertex shader");
1138       break;
1139
1140    case ir_unop_noise:
1141       assert(!"not reached: should be handled by lower_noise");
1142       break;
1143
1144    case ir_binop_add:
1145       emit(ADD(result_dst, op[0], op[1]));
1146       break;
1147    case ir_binop_sub:
1148       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1149       break;
1150
1151    case ir_binop_mul:
1152       if (ir->type->is_integer()) {
1153          /* For integer multiplication, the MUL uses the low 16 bits
1154           * of one of the operands (src0 on gen6, src1 on gen7).  The
1155           * MACH accumulates in the contribution of the upper 16 bits
1156           * of that operand.
1157           *
1158           * FINISHME: Emit just the MUL if we know an operand is small
1159           * enough.
1160           */
1161          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1162
1163          emit(MUL(acc, op[0], op[1]));
1164          emit(MACH(dst_null_d(), op[0], op[1]));
1165          emit(MOV(result_dst, src_reg(acc)));
1166       } else {
1167          emit(MUL(result_dst, op[0], op[1]));
1168       }
1169       break;
1170    case ir_binop_div:
1171       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1172       assert(ir->type->is_integer());
1173       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1174       break;
1175    case ir_binop_mod:
1176       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1177       assert(ir->type->is_integer());
1178       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1179       break;
1180
1181    case ir_binop_less:
1182    case ir_binop_greater:
1183    case ir_binop_lequal:
1184    case ir_binop_gequal:
1185    case ir_binop_equal:
1186    case ir_binop_nequal: {
1187       emit(CMP(result_dst, op[0], op[1],
1188                brw_conditional_for_comparison(ir->operation)));
1189       emit(AND(result_dst, result_src, src_reg(0x1)));
1190       break;
1191    }
1192
1193    case ir_binop_all_equal:
1194       /* "==" operator producing a scalar boolean. */
1195       if (ir->operands[0]->type->is_vector() ||
1196           ir->operands[1]->type->is_vector()) {
1197          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1198          emit(MOV(result_dst, src_reg(0)));
1199          inst = emit(MOV(result_dst, src_reg(1)));
1200          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1201       } else {
1202          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1203          emit(AND(result_dst, result_src, src_reg(0x1)));
1204       }
1205       break;
1206    case ir_binop_any_nequal:
1207       /* "!=" operator producing a scalar boolean. */
1208       if (ir->operands[0]->type->is_vector() ||
1209           ir->operands[1]->type->is_vector()) {
1210          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1211
1212          emit(MOV(result_dst, src_reg(0)));
1213          inst = emit(MOV(result_dst, src_reg(1)));
1214          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1215       } else {
1216          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1217          emit(AND(result_dst, result_src, src_reg(0x1)));
1218       }
1219       break;
1220
1221    case ir_unop_any:
1222       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1223       emit(MOV(result_dst, src_reg(0)));
1224
1225       inst = emit(MOV(result_dst, src_reg(1)));
1226       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1227       break;
1228
1229    case ir_binop_logic_xor:
1230       emit(XOR(result_dst, op[0], op[1]));
1231       break;
1232
1233    case ir_binop_logic_or:
1234       emit(OR(result_dst, op[0], op[1]));
1235       break;
1236
1237    case ir_binop_logic_and:
1238       emit(AND(result_dst, op[0], op[1]));
1239       break;
1240
1241    case ir_binop_dot:
1242       assert(ir->operands[0]->type->is_vector());
1243       assert(ir->operands[0]->type == ir->operands[1]->type);
1244       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1245       break;
1246
1247    case ir_unop_sqrt:
1248       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1249       break;
1250    case ir_unop_rsq:
1251       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1252       break;
1253
1254    case ir_unop_bitcast_i2f:
1255    case ir_unop_bitcast_u2f:
1256       this->result = op[0];
1257       this->result.type = BRW_REGISTER_TYPE_F;
1258       break;
1259
1260    case ir_unop_bitcast_f2i:
1261       this->result = op[0];
1262       this->result.type = BRW_REGISTER_TYPE_D;
1263       break;
1264
1265    case ir_unop_bitcast_f2u:
1266       this->result = op[0];
1267       this->result.type = BRW_REGISTER_TYPE_UD;
1268       break;
1269
1270    case ir_unop_i2f:
1271    case ir_unop_i2u:
1272    case ir_unop_u2i:
1273    case ir_unop_u2f:
1274    case ir_unop_b2f:
1275    case ir_unop_b2i:
1276    case ir_unop_f2i:
1277    case ir_unop_f2u:
1278       emit(MOV(result_dst, op[0]));
1279       break;
1280    case ir_unop_f2b:
1281    case ir_unop_i2b: {
1282       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1283       emit(AND(result_dst, result_src, src_reg(1)));
1284       break;
1285    }
1286
1287    case ir_unop_trunc:
1288       emit(RNDZ(result_dst, op[0]));
1289       break;
1290    case ir_unop_ceil:
1291       op[0].negate = !op[0].negate;
1292       inst = emit(RNDD(result_dst, op[0]));
1293       this->result.negate = true;
1294       break;
1295    case ir_unop_floor:
1296       inst = emit(RNDD(result_dst, op[0]));
1297       break;
1298    case ir_unop_fract:
1299       inst = emit(FRC(result_dst, op[0]));
1300       break;
1301    case ir_unop_round_even:
1302       emit(RNDE(result_dst, op[0]));
1303       break;
1304
1305    case ir_binop_min:
1306       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1307       break;
1308    case ir_binop_max:
1309       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1310       break;
1311
1312    case ir_binop_pow:
1313       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1314       break;
1315
1316    case ir_unop_bit_not:
1317       inst = emit(NOT(result_dst, op[0]));
1318       break;
1319    case ir_binop_bit_and:
1320       inst = emit(AND(result_dst, op[0], op[1]));
1321       break;
1322    case ir_binop_bit_xor:
1323       inst = emit(XOR(result_dst, op[0], op[1]));
1324       break;
1325    case ir_binop_bit_or:
1326       inst = emit(OR(result_dst, op[0], op[1]));
1327       break;
1328
1329    case ir_binop_lshift:
1330       inst = emit(SHL(result_dst, op[0], op[1]));
1331       break;
1332
1333    case ir_binop_rshift:
1334       if (ir->type->base_type == GLSL_TYPE_INT)
1335          inst = emit(ASR(result_dst, op[0], op[1]));
1336       else
1337          inst = emit(SHR(result_dst, op[0], op[1]));
1338       break;
1339
1340    case ir_binop_ubo_load: {
1341       ir_constant *uniform_block = ir->operands[0]->as_constant();
1342       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1343       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1344       src_reg offset = op[1];
1345
1346       /* Now, load the vector from that offset. */
1347       assert(ir->type->is_vector() || ir->type->is_scalar());
1348
1349       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1350       packed_consts.type = result.type;
1351       src_reg surf_index =
1352          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1353       if (const_offset_ir) {
1354          offset = src_reg(const_offset / 16);
1355       } else {
1356          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1357       }
1358
1359       vec4_instruction *pull =
1360          emit(new(mem_ctx) vec4_instruction(this,
1361                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1362                                             dst_reg(packed_consts),
1363                                             surf_index,
1364                                             offset));
1365       pull->base_mrf = 14;
1366       pull->mlen = 1;
1367
1368       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1369       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1370                                             const_offset % 16 / 4,
1371                                             const_offset % 16 / 4,
1372                                             const_offset % 16 / 4);
1373
1374       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1375       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1376          emit(CMP(result_dst, packed_consts, src_reg(0u),
1377                   BRW_CONDITIONAL_NZ));
1378          emit(AND(result_dst, result, src_reg(0x1)));
1379       } else {
1380          emit(MOV(result_dst, packed_consts));
1381       }
1382       break;
1383    }
1384
1385    case ir_quadop_vector:
1386       assert(!"not reached: should be handled by lower_quadop_vector");
1387       break;
1388    }
1389 }
1390
1391
1392 void
1393 vec4_visitor::visit(ir_swizzle *ir)
1394 {
1395    src_reg src;
1396    int i = 0;
1397    int swizzle[4];
1398
1399    /* Note that this is only swizzles in expressions, not those on the left
1400     * hand side of an assignment, which do write masking.  See ir_assignment
1401     * for that.
1402     */
1403
1404    ir->val->accept(this);
1405    src = this->result;
1406    assert(src.file != BAD_FILE);
1407
1408    for (i = 0; i < ir->type->vector_elements; i++) {
1409       switch (i) {
1410       case 0:
1411          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1412          break;
1413       case 1:
1414          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1415          break;
1416       case 2:
1417          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1418          break;
1419       case 3:
1420          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1421             break;
1422       }
1423    }
1424    for (; i < 4; i++) {
1425       /* Replicate the last channel out. */
1426       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1427    }
1428
1429    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1430
1431    this->result = src;
1432 }
1433
1434 void
1435 vec4_visitor::visit(ir_dereference_variable *ir)
1436 {
1437    const struct glsl_type *type = ir->type;
1438    dst_reg *reg = variable_storage(ir->var);
1439
1440    if (!reg) {
1441       fail("Failed to find variable storage for %s\n", ir->var->name);
1442       this->result = src_reg(brw_null_reg());
1443       return;
1444    }
1445
1446    this->result = src_reg(*reg);
1447
1448    /* System values get their swizzle from the dst_reg writemask */
1449    if (ir->var->mode == ir_var_system_value)
1450       return;
1451
1452    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1453       this->result.swizzle = swizzle_for_size(type->vector_elements);
1454 }
1455
1456 void
1457 vec4_visitor::visit(ir_dereference_array *ir)
1458 {
1459    ir_constant *constant_index;
1460    src_reg src;
1461    int element_size = type_size(ir->type);
1462
1463    constant_index = ir->array_index->constant_expression_value();
1464
1465    ir->array->accept(this);
1466    src = this->result;
1467
1468    if (constant_index) {
1469       src.reg_offset += constant_index->value.i[0] * element_size;
1470    } else {
1471       /* Variable index array dereference.  It eats the "vec4" of the
1472        * base of the array and an index that offsets the Mesa register
1473        * index.
1474        */
1475       ir->array_index->accept(this);
1476
1477       src_reg index_reg;
1478
1479       if (element_size == 1) {
1480          index_reg = this->result;
1481       } else {
1482          index_reg = src_reg(this, glsl_type::int_type);
1483
1484          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1485       }
1486
1487       if (src.reladdr) {
1488          src_reg temp = src_reg(this, glsl_type::int_type);
1489
1490          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1491
1492          index_reg = temp;
1493       }
1494
1495       src.reladdr = ralloc(mem_ctx, src_reg);
1496       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1497    }
1498
1499    /* If the type is smaller than a vec4, replicate the last channel out. */
1500    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1501       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1502    else
1503       src.swizzle = BRW_SWIZZLE_NOOP;
1504    src.type = brw_type_for_base_type(ir->type);
1505
1506    this->result = src;
1507 }
1508
1509 void
1510 vec4_visitor::visit(ir_dereference_record *ir)
1511 {
1512    unsigned int i;
1513    const glsl_type *struct_type = ir->record->type;
1514    int offset = 0;
1515
1516    ir->record->accept(this);
1517
1518    for (i = 0; i < struct_type->length; i++) {
1519       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1520          break;
1521       offset += type_size(struct_type->fields.structure[i].type);
1522    }
1523
1524    /* If the type is smaller than a vec4, replicate the last channel out. */
1525    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1526       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1527    else
1528       this->result.swizzle = BRW_SWIZZLE_NOOP;
1529    this->result.type = brw_type_for_base_type(ir->type);
1530
1531    this->result.reg_offset += offset;
1532 }
1533
1534 /**
1535  * We want to be careful in assignment setup to hit the actual storage
1536  * instead of potentially using a temporary like we might with the
1537  * ir_dereference handler.
1538  */
1539 static dst_reg
1540 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1541 {
1542    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1543     * access of a vector, it must be separated into a series conditional moves
1544     * before reaching this point (see ir_vec_index_to_cond_assign).
1545     */
1546    assert(ir->as_dereference());
1547    ir_dereference_array *deref_array = ir->as_dereference_array();
1548    if (deref_array) {
1549       assert(!deref_array->array->type->is_vector());
1550    }
1551
1552    /* Use the rvalue deref handler for the most part.  We'll ignore
1553     * swizzles in it and write swizzles using writemask, though.
1554     */
1555    ir->accept(v);
1556    return dst_reg(v->result);
1557 }
1558
1559 void
1560 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1561                               const struct glsl_type *type, uint32_t predicate)
1562 {
1563    if (type->base_type == GLSL_TYPE_STRUCT) {
1564       for (unsigned int i = 0; i < type->length; i++) {
1565          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1566       }
1567       return;
1568    }
1569
1570    if (type->is_array()) {
1571       for (unsigned int i = 0; i < type->length; i++) {
1572          emit_block_move(dst, src, type->fields.array, predicate);
1573       }
1574       return;
1575    }
1576
1577    if (type->is_matrix()) {
1578       const struct glsl_type *vec_type;
1579
1580       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1581                                          type->vector_elements, 1);
1582
1583       for (int i = 0; i < type->matrix_columns; i++) {
1584          emit_block_move(dst, src, vec_type, predicate);
1585       }
1586       return;
1587    }
1588
1589    assert(type->is_scalar() || type->is_vector());
1590
1591    dst->type = brw_type_for_base_type(type);
1592    src->type = dst->type;
1593
1594    dst->writemask = (1 << type->vector_elements) - 1;
1595
1596    src->swizzle = swizzle_for_size(type->vector_elements);
1597
1598    vec4_instruction *inst = emit(MOV(*dst, *src));
1599    inst->predicate = predicate;
1600
1601    dst->reg_offset++;
1602    src->reg_offset++;
1603 }
1604
1605
1606 /* If the RHS processing resulted in an instruction generating a
1607  * temporary value, and it would be easy to rewrite the instruction to
1608  * generate its result right into the LHS instead, do so.  This ends
1609  * up reliably removing instructions where it can be tricky to do so
1610  * later without real UD chain information.
1611  */
1612 bool
1613 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1614                                      dst_reg dst,
1615                                      src_reg src,
1616                                      vec4_instruction *pre_rhs_inst,
1617                                      vec4_instruction *last_rhs_inst)
1618 {
1619    /* This could be supported, but it would take more smarts. */
1620    if (ir->condition)
1621       return false;
1622
1623    if (pre_rhs_inst == last_rhs_inst)
1624       return false; /* No instructions generated to work with. */
1625
1626    /* Make sure the last instruction generated our source reg. */
1627    if (src.file != GRF ||
1628        src.file != last_rhs_inst->dst.file ||
1629        src.reg != last_rhs_inst->dst.reg ||
1630        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1631        src.reladdr ||
1632        src.abs ||
1633        src.negate ||
1634        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1635       return false;
1636
1637    /* Check that that last instruction fully initialized the channels
1638     * we want to use, in the order we want to use them.  We could
1639     * potentially reswizzle the operands of many instructions so that
1640     * we could handle out of order channels, but don't yet.
1641     */
1642
1643    for (unsigned i = 0; i < 4; i++) {
1644       if (dst.writemask & (1 << i)) {
1645          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1646             return false;
1647
1648          if (BRW_GET_SWZ(src.swizzle, i) != i)
1649             return false;
1650       }
1651    }
1652
1653    /* Success!  Rewrite the instruction. */
1654    last_rhs_inst->dst.file = dst.file;
1655    last_rhs_inst->dst.reg = dst.reg;
1656    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1657    last_rhs_inst->dst.reladdr = dst.reladdr;
1658    last_rhs_inst->dst.writemask &= dst.writemask;
1659
1660    return true;
1661 }
1662
1663 void
1664 vec4_visitor::visit(ir_assignment *ir)
1665 {
1666    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1667    uint32_t predicate = BRW_PREDICATE_NONE;
1668
1669    if (!ir->lhs->type->is_scalar() &&
1670        !ir->lhs->type->is_vector()) {
1671       ir->rhs->accept(this);
1672       src_reg src = this->result;
1673
1674       if (ir->condition) {
1675          emit_bool_to_cond_code(ir->condition, &predicate);
1676       }
1677
1678       /* emit_block_move doesn't account for swizzles in the source register.
1679        * This should be ok, since the source register is a structure or an
1680        * array, and those can't be swizzled.  But double-check to be sure.
1681        */
1682       assert(src.swizzle ==
1683              (ir->rhs->type->is_matrix()
1684               ? swizzle_for_size(ir->rhs->type->vector_elements)
1685               : BRW_SWIZZLE_NOOP));
1686
1687       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1688       return;
1689    }
1690
1691    /* Now we're down to just a scalar/vector with writemasks. */
1692    int i;
1693
1694    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1695    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1696
1697    ir->rhs->accept(this);
1698
1699    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1700
1701    src_reg src = this->result;
1702
1703    int swizzles[4];
1704    int first_enabled_chan = 0;
1705    int src_chan = 0;
1706
1707    assert(ir->lhs->type->is_vector() ||
1708           ir->lhs->type->is_scalar());
1709    dst.writemask = ir->write_mask;
1710
1711    for (int i = 0; i < 4; i++) {
1712       if (dst.writemask & (1 << i)) {
1713          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1714          break;
1715       }
1716    }
1717
1718    /* Swizzle a small RHS vector into the channels being written.
1719     *
1720     * glsl ir treats write_mask as dictating how many channels are
1721     * present on the RHS while in our instructions we need to make
1722     * those channels appear in the slots of the vec4 they're written to.
1723     */
1724    for (int i = 0; i < 4; i++) {
1725       if (dst.writemask & (1 << i))
1726          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1727       else
1728          swizzles[i] = first_enabled_chan;
1729    }
1730    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1731                               swizzles[2], swizzles[3]);
1732
1733    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1734       return;
1735    }
1736
1737    if (ir->condition) {
1738       emit_bool_to_cond_code(ir->condition, &predicate);
1739    }
1740
1741    for (i = 0; i < type_size(ir->lhs->type); i++) {
1742       vec4_instruction *inst = emit(MOV(dst, src));
1743       inst->predicate = predicate;
1744
1745       dst.reg_offset++;
1746       src.reg_offset++;
1747    }
1748 }
1749
1750 void
1751 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1752 {
1753    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1754       foreach_list(node, &ir->components) {
1755          ir_constant *field_value = (ir_constant *)node;
1756
1757          emit_constant_values(dst, field_value);
1758       }
1759       return;
1760    }
1761
1762    if (ir->type->is_array()) {
1763       for (unsigned int i = 0; i < ir->type->length; i++) {
1764          emit_constant_values(dst, ir->array_elements[i]);
1765       }
1766       return;
1767    }
1768
1769    if (ir->type->is_matrix()) {
1770       for (int i = 0; i < ir->type->matrix_columns; i++) {
1771          float *vec = &ir->value.f[i * ir->type->vector_elements];
1772
1773          for (int j = 0; j < ir->type->vector_elements; j++) {
1774             dst->writemask = 1 << j;
1775             dst->type = BRW_REGISTER_TYPE_F;
1776
1777             emit(MOV(*dst, src_reg(vec[j])));
1778          }
1779          dst->reg_offset++;
1780       }
1781       return;
1782    }
1783
1784    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1785
1786    for (int i = 0; i < ir->type->vector_elements; i++) {
1787       if (!(remaining_writemask & (1 << i)))
1788          continue;
1789
1790       dst->writemask = 1 << i;
1791       dst->type = brw_type_for_base_type(ir->type);
1792
1793       /* Find other components that match the one we're about to
1794        * write.  Emits fewer instructions for things like vec4(0.5,
1795        * 1.5, 1.5, 1.5).
1796        */
1797       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1798          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1799             if (ir->value.b[i] == ir->value.b[j])
1800                dst->writemask |= (1 << j);
1801          } else {
1802             /* u, i, and f storage all line up, so no need for a
1803              * switch case for comparing each type.
1804              */
1805             if (ir->value.u[i] == ir->value.u[j])
1806                dst->writemask |= (1 << j);
1807          }
1808       }
1809
1810       switch (ir->type->base_type) {
1811       case GLSL_TYPE_FLOAT:
1812          emit(MOV(*dst, src_reg(ir->value.f[i])));
1813          break;
1814       case GLSL_TYPE_INT:
1815          emit(MOV(*dst, src_reg(ir->value.i[i])));
1816          break;
1817       case GLSL_TYPE_UINT:
1818          emit(MOV(*dst, src_reg(ir->value.u[i])));
1819          break;
1820       case GLSL_TYPE_BOOL:
1821          emit(MOV(*dst, src_reg(ir->value.b[i])));
1822          break;
1823       default:
1824          assert(!"Non-float/uint/int/bool constant");
1825          break;
1826       }
1827
1828       remaining_writemask &= ~dst->writemask;
1829    }
1830    dst->reg_offset++;
1831 }
1832
1833 void
1834 vec4_visitor::visit(ir_constant *ir)
1835 {
1836    dst_reg dst = dst_reg(this, ir->type);
1837    this->result = src_reg(dst);
1838
1839    emit_constant_values(&dst, ir);
1840 }
1841
1842 void
1843 vec4_visitor::visit(ir_call *ir)
1844 {
1845    assert(!"not reached");
1846 }
1847
1848 void
1849 vec4_visitor::visit(ir_texture *ir)
1850 {
1851    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1852
1853    /* Should be lowered by do_lower_texture_projection */
1854    assert(!ir->projector);
1855
1856    /* Generate code to compute all the subexpression trees.  This has to be
1857     * done before loading any values into MRFs for the sampler message since
1858     * generating these values may involve SEND messages that need the MRFs.
1859     */
1860    src_reg coordinate;
1861    if (ir->coordinate) {
1862       ir->coordinate->accept(this);
1863       coordinate = this->result;
1864    }
1865
1866    src_reg shadow_comparitor;
1867    if (ir->shadow_comparitor) {
1868       ir->shadow_comparitor->accept(this);
1869       shadow_comparitor = this->result;
1870    }
1871
1872    src_reg lod, dPdx, dPdy;
1873    switch (ir->op) {
1874    case ir_txf:
1875    case ir_txl:
1876    case ir_txs:
1877       ir->lod_info.lod->accept(this);
1878       lod = this->result;
1879       break;
1880    case ir_txd:
1881       ir->lod_info.grad.dPdx->accept(this);
1882       dPdx = this->result;
1883
1884       ir->lod_info.grad.dPdy->accept(this);
1885       dPdy = this->result;
1886       break;
1887    case ir_tex:
1888    case ir_txb:
1889       break;
1890    }
1891
1892    vec4_instruction *inst = NULL;
1893    switch (ir->op) {
1894    case ir_tex:
1895    case ir_txl:
1896       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1897       break;
1898    case ir_txd:
1899       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1900       break;
1901    case ir_txf:
1902       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1903       break;
1904    case ir_txs:
1905       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1906       break;
1907    case ir_txb:
1908       assert(!"TXB is not valid for vertex shaders.");
1909    }
1910
1911    /* Texel offsets go in the message header; Gen4 also requires headers. */
1912    inst->header_present = ir->offset || intel->gen < 5;
1913    inst->base_mrf = 2;
1914    inst->mlen = inst->header_present + 1; /* always at least one */
1915    inst->sampler = sampler;
1916    inst->dst = dst_reg(this, ir->type);
1917    inst->shadow_compare = ir->shadow_comparitor != NULL;
1918
1919    if (ir->offset != NULL && ir->op != ir_txf)
1920       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1921
1922    /* MRF for the first parameter */
1923    int param_base = inst->base_mrf + inst->header_present;
1924
1925    if (ir->op == ir_txs) {
1926       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1927       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1928            lod));
1929    } else {
1930       int i, coord_mask = 0, zero_mask = 0;
1931       /* Load the coordinate */
1932       /* FINISHME: gl_clamp_mask and saturate */
1933       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1934          coord_mask |= (1 << i);
1935       for (; i < 4; i++)
1936          zero_mask |= (1 << i);
1937
1938       if (ir->offset && ir->op == ir_txf) {
1939          /* It appears that the ld instruction used for txf does its
1940           * address bounds check before adding in the offset.  To work
1941           * around this, just add the integer offset to the integer
1942           * texel coordinate, and don't put the offset in the header.
1943           */
1944          ir_constant *offset = ir->offset->as_constant();
1945          assert(offset);
1946
1947          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1948             src_reg src = coordinate;
1949             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1950                                        BRW_GET_SWZ(src.swizzle, j),
1951                                        BRW_GET_SWZ(src.swizzle, j),
1952                                        BRW_GET_SWZ(src.swizzle, j));
1953             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1954                      src, offset->value.i[j]));
1955          }
1956       } else {
1957          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1958                   coordinate));
1959       }
1960       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1961                src_reg(0)));
1962       /* Load the shadow comparitor */
1963       if (ir->shadow_comparitor) {
1964          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1965                           WRITEMASK_X),
1966                   shadow_comparitor));
1967          inst->mlen++;
1968       }
1969
1970       /* Load the LOD info */
1971       if (ir->op == ir_txl) {
1972          int mrf, writemask;
1973          if (intel->gen >= 5) {
1974             mrf = param_base + 1;
1975             if (ir->shadow_comparitor) {
1976                writemask = WRITEMASK_Y;
1977                /* mlen already incremented */
1978             } else {
1979                writemask = WRITEMASK_X;
1980                inst->mlen++;
1981             }
1982          } else /* intel->gen == 4 */ {
1983             mrf = param_base;
1984             writemask = WRITEMASK_Z;
1985          }
1986          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
1987       } else if (ir->op == ir_txf) {
1988          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1989                   lod));
1990       } else if (ir->op == ir_txd) {
1991          const glsl_type *type = ir->lod_info.grad.dPdx->type;
1992
1993          if (intel->gen >= 5) {
1994             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1995             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1996             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1997             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1998             inst->mlen++;
1999
2000             if (ir->type->vector_elements == 3) {
2001                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2002                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2003                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2004                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2005                inst->mlen++;
2006             }
2007          } else /* intel->gen == 4 */ {
2008             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2009             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2010             inst->mlen += 2;
2011          }
2012       }
2013    }
2014
2015    emit(inst);
2016
2017    swizzle_result(ir, src_reg(inst->dst), sampler);
2018 }
2019
2020 void
2021 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2022 {
2023    this->result = orig_val;
2024
2025    int s = c->key.tex.swizzles[sampler];
2026
2027    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2028                         || s == SWIZZLE_NOOP)
2029       return;
2030
2031    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2032    int swizzle[4];
2033
2034    for (int i = 0; i < 4; i++) {
2035       switch (GET_SWZ(s, i)) {
2036       case SWIZZLE_ZERO:
2037          zero_mask |= (1 << i);
2038          break;
2039       case SWIZZLE_ONE:
2040          one_mask |= (1 << i);
2041          break;
2042       default:
2043          copy_mask |= (1 << i);
2044          swizzle[i] = GET_SWZ(s, i);
2045          break;
2046       }
2047    }
2048
2049    this->result = src_reg(this, ir->type);
2050    dst_reg swizzled_result(this->result);
2051
2052    if (copy_mask) {
2053       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2054       swizzled_result.writemask = copy_mask;
2055       emit(MOV(swizzled_result, orig_val));
2056    }
2057
2058    if (zero_mask) {
2059       swizzled_result.writemask = zero_mask;
2060       emit(MOV(swizzled_result, src_reg(0.0f)));
2061    }
2062
2063    if (one_mask) {
2064       swizzled_result.writemask = one_mask;
2065       emit(MOV(swizzled_result, src_reg(1.0f)));
2066    }
2067 }
2068
2069 void
2070 vec4_visitor::visit(ir_return *ir)
2071 {
2072    assert(!"not reached");
2073 }
2074
2075 void
2076 vec4_visitor::visit(ir_discard *ir)
2077 {
2078    assert(!"not reached");
2079 }
2080
2081 void
2082 vec4_visitor::visit(ir_if *ir)
2083 {
2084    /* Don't point the annotation at the if statement, because then it plus
2085     * the then and else blocks get printed.
2086     */
2087    this->base_ir = ir->condition;
2088
2089    if (intel->gen == 6) {
2090       emit_if_gen6(ir);
2091    } else {
2092       uint32_t predicate;
2093       emit_bool_to_cond_code(ir->condition, &predicate);
2094       emit(IF(predicate));
2095    }
2096
2097    visit_instructions(&ir->then_instructions);
2098
2099    if (!ir->else_instructions.is_empty()) {
2100       this->base_ir = ir->condition;
2101       emit(BRW_OPCODE_ELSE);
2102
2103       visit_instructions(&ir->else_instructions);
2104    }
2105
2106    this->base_ir = ir->condition;
2107    emit(BRW_OPCODE_ENDIF);
2108 }
2109
2110 void
2111 vec4_visitor::emit_ndc_computation()
2112 {
2113    /* Get the position */
2114    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2115
2116    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2117    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2118    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2119
2120    current_annotation = "NDC";
2121    dst_reg ndc_w = ndc;
2122    ndc_w.writemask = WRITEMASK_W;
2123    src_reg pos_w = pos;
2124    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2125    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2126
2127    dst_reg ndc_xyz = ndc;
2128    ndc_xyz.writemask = WRITEMASK_XYZ;
2129
2130    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2131 }
2132
2133 void
2134 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2135 {
2136    if (intel->gen < 6 &&
2137        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2138         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2139       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2140       dst_reg header1_w = header1;
2141       header1_w.writemask = WRITEMASK_W;
2142       GLuint i;
2143
2144       emit(MOV(header1, 0u));
2145
2146       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2147          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2148
2149          current_annotation = "Point size";
2150          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2151          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2152       }
2153
2154       current_annotation = "Clipping flags";
2155       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2156          vec4_instruction *inst;
2157
2158          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2159                          src_reg(this->userplane[i])));
2160          inst->conditional_mod = BRW_CONDITIONAL_L;
2161
2162          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2163          inst->predicate = BRW_PREDICATE_NORMAL;
2164       }
2165
2166       /* i965 clipping workaround:
2167        * 1) Test for -ve rhw
2168        * 2) If set,
2169        *      set ndc = (0,0,0,0)
2170        *      set ucp[6] = 1
2171        *
2172        * Later, clipping will detect ucp[6] and ensure the primitive is
2173        * clipped against all fixed planes.
2174        */
2175       if (brw->has_negative_rhw_bug) {
2176 #if 0
2177          /* FINISHME */
2178          brw_CMP(p,
2179                  vec8(brw_null_reg()),
2180                  BRW_CONDITIONAL_L,
2181                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2182                  brw_imm_f(0));
2183
2184          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2185          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2186          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2187 #endif
2188       }
2189
2190       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2191    } else if (intel->gen < 6) {
2192       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2193    } else {
2194       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2195       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2196          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2197                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2198       }
2199    }
2200 }
2201
2202 void
2203 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2204 {
2205    if (intel->gen < 6) {
2206       /* Clip distance slots are set aside in gen5, but they are not used.  It
2207        * is not clear whether we actually need to set aside space for them,
2208        * but the performance cost is negligible.
2209        */
2210       return;
2211    }
2212
2213    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2214     *
2215     *     "If a linked set of shaders forming the vertex stage contains no
2216     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2217     *     application has requested clipping against user clip planes through
2218     *     the API, then the coordinate written to gl_Position is used for
2219     *     comparison against the user clip planes."
2220     *
2221     * This function is only called if the shader didn't write to
2222     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2223     * if the user wrote to it; otherwise we use gl_Position.
2224     */
2225    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2226    if (!(c->prog_data.outputs_written
2227          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2228       clip_vertex = VERT_RESULT_HPOS;
2229    }
2230
2231    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2232         ++i) {
2233       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2234                src_reg(output_reg[clip_vertex]),
2235                src_reg(this->userplane[i + offset])));
2236    }
2237 }
2238
2239 void
2240 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2241 {
2242    assert (vert_result < VERT_RESULT_MAX);
2243    reg.type = output_reg[vert_result].type;
2244    current_annotation = output_reg_annotation[vert_result];
2245    /* Copy the register, saturating if necessary */
2246    vec4_instruction *inst = emit(MOV(reg,
2247                                      src_reg(output_reg[vert_result])));
2248    if ((vert_result == VERT_RESULT_COL0 ||
2249         vert_result == VERT_RESULT_COL1 ||
2250         vert_result == VERT_RESULT_BFC0 ||
2251         vert_result == VERT_RESULT_BFC1) &&
2252        c->key.clamp_vertex_color) {
2253       inst->saturate = true;
2254    }
2255 }
2256
2257 void
2258 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2259 {
2260    struct brw_reg hw_reg = brw_message_reg(mrf);
2261    dst_reg reg = dst_reg(MRF, mrf);
2262    reg.type = BRW_REGISTER_TYPE_F;
2263
2264    switch (vert_result) {
2265    case VERT_RESULT_PSIZ:
2266       /* PSIZ is always in slot 0, and is coupled with other flags. */
2267       current_annotation = "indices, point width, clip flags";
2268       emit_psiz_and_flags(hw_reg);
2269       break;
2270    case BRW_VERT_RESULT_NDC:
2271       current_annotation = "NDC";
2272       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2273       break;
2274    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2275    case VERT_RESULT_HPOS:
2276       current_annotation = "gl_Position";
2277       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2278       break;
2279    case VERT_RESULT_CLIP_DIST0:
2280    case VERT_RESULT_CLIP_DIST1:
2281       if (this->c->key.uses_clip_distance) {
2282          emit_generic_urb_slot(reg, vert_result);
2283       } else {
2284          current_annotation = "user clip distances";
2285          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2286       }
2287       break;
2288    case VERT_RESULT_EDGE:
2289       /* This is present when doing unfilled polygons.  We're supposed to copy
2290        * the edge flag from the user-provided vertex array
2291        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2292        * of that attribute (starts as 1.0f).  This is then used in clipping to
2293        * determine which edges should be drawn as wireframe.
2294        */
2295       current_annotation = "edge flag";
2296       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2297                                     glsl_type::float_type, WRITEMASK_XYZW))));
2298       break;
2299    case BRW_VERT_RESULT_PAD:
2300       /* No need to write to this slot */
2301       break;
2302    default:
2303       emit_generic_urb_slot(reg, vert_result);
2304       break;
2305    }
2306 }
2307
2308 static int
2309 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2310 {
2311    struct intel_context *intel = &brw->intel;
2312
2313    if (intel->gen >= 6) {
2314       /* URB data written (does not include the message header reg) must
2315        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2316        * section 5.4.3.2.2: URB_INTERLEAVED.
2317        *
2318        * URB entries are allocated on a multiple of 1024 bits, so an
2319        * extra 128 bits written here to make the end align to 256 is
2320        * no problem.
2321        */
2322       if ((mlen % 2) != 1)
2323          mlen++;
2324    }
2325
2326    return mlen;
2327 }
2328
2329 /**
2330  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2331  * complete the VS thread.
2332  *
2333  * The VUE layout is documented in Volume 2a.
2334  */
2335 void
2336 vec4_visitor::emit_urb_writes()
2337 {
2338    /* MRF 0 is reserved for the debugger, so start with message header
2339     * in MRF 1.
2340     */
2341    int base_mrf = 1;
2342    int mrf = base_mrf;
2343    /* In the process of generating our URB write message contents, we
2344     * may need to unspill a register or load from an array.  Those
2345     * reads would use MRFs 14-15.
2346     */
2347    int max_usable_mrf = 13;
2348
2349    /* The following assertion verifies that max_usable_mrf causes an
2350     * even-numbered amount of URB write data, which will meet gen6's
2351     * requirements for length alignment.
2352     */
2353    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2354
2355    /* First mrf is the g0-based message header containing URB handles and such,
2356     * which is implied in VS_OPCODE_URB_WRITE.
2357     */
2358    mrf++;
2359
2360    if (intel->gen < 6) {
2361       emit_ndc_computation();
2362    }
2363
2364    /* Set up the VUE data for the first URB write */
2365    int slot;
2366    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2367       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2368
2369       /* If this was max_usable_mrf, we can't fit anything more into this URB
2370        * WRITE.
2371        */
2372       if (mrf > max_usable_mrf) {
2373          slot++;
2374          break;
2375       }
2376    }
2377
2378    current_annotation = "URB write";
2379    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2380    inst->base_mrf = base_mrf;
2381    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2382    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2383
2384    /* Optional second URB write */
2385    if (!inst->eot) {
2386       mrf = base_mrf + 1;
2387
2388       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2389          assert(mrf < max_usable_mrf);
2390
2391          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2392       }
2393
2394       current_annotation = "URB write";
2395       inst = emit(VS_OPCODE_URB_WRITE);
2396       inst->base_mrf = base_mrf;
2397       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2398       inst->eot = true;
2399       /* URB destination offset.  In the previous write, we got MRFs
2400        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2401        * URB row increments, and each of our MRFs is half of one of
2402        * those, since we're doing interleaved writes.
2403        */
2404       inst->offset = (max_usable_mrf - base_mrf) / 2;
2405    }
2406 }
2407
2408 src_reg
2409 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2410                                  src_reg *reladdr, int reg_offset)
2411 {
2412    /* Because we store the values to scratch interleaved like our
2413     * vertex data, we need to scale the vec4 index by 2.
2414     */
2415    int message_header_scale = 2;
2416
2417    /* Pre-gen6, the message header uses byte offsets instead of vec4
2418     * (16-byte) offset units.
2419     */
2420    if (intel->gen < 6)
2421       message_header_scale *= 16;
2422
2423    if (reladdr) {
2424       src_reg index = src_reg(this, glsl_type::int_type);
2425
2426       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2427       emit_before(inst, MUL(dst_reg(index),
2428                             index, src_reg(message_header_scale)));
2429
2430       return index;
2431    } else {
2432       return src_reg(reg_offset * message_header_scale);
2433    }
2434 }
2435
2436 src_reg
2437 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2438                                        src_reg *reladdr, int reg_offset)
2439 {
2440    if (reladdr) {
2441       src_reg index = src_reg(this, glsl_type::int_type);
2442
2443       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2444
2445       /* Pre-gen6, the message header uses byte offsets instead of vec4
2446        * (16-byte) offset units.
2447        */
2448       if (intel->gen < 6) {
2449          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2450       }
2451
2452       return index;
2453    } else {
2454       int message_header_scale = intel->gen < 6 ? 16 : 1;
2455       return src_reg(reg_offset * message_header_scale);
2456    }
2457 }
2458
2459 /**
2460  * Emits an instruction before @inst to load the value named by @orig_src
2461  * from scratch space at @base_offset to @temp.
2462  *
2463  * @base_offset is measured in 32-byte units (the size of a register).
2464  */
2465 void
2466 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2467                                 dst_reg temp, src_reg orig_src,
2468                                 int base_offset)
2469 {
2470    int reg_offset = base_offset + orig_src.reg_offset;
2471    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2472
2473    emit_before(inst, SCRATCH_READ(temp, index));
2474 }
2475
2476 /**
2477  * Emits an instruction after @inst to store the value to be written
2478  * to @orig_dst to scratch space at @base_offset, from @temp.
2479  *
2480  * @base_offset is measured in 32-byte units (the size of a register).
2481  */
2482 void
2483 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2484 {
2485    int reg_offset = base_offset + inst->dst.reg_offset;
2486    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2487
2488    /* Create a temporary register to store *inst's result in.
2489     *
2490     * We have to be careful in MOVing from our temporary result register in
2491     * the scratch write.  If we swizzle from channels of the temporary that
2492     * weren't initialized, it will confuse live interval analysis, which will
2493     * make spilling fail to make progress.
2494     */
2495    src_reg temp = src_reg(this, glsl_type::vec4_type);
2496    temp.type = inst->dst.type;
2497    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2498    int swizzles[4];
2499    for (int i = 0; i < 4; i++)
2500       if (inst->dst.writemask & (1 << i))
2501          swizzles[i] = i;
2502       else
2503          swizzles[i] = first_writemask_chan;
2504    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2505                                swizzles[2], swizzles[3]);
2506
2507    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2508                                        inst->dst.writemask));
2509    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2510    write->predicate = inst->predicate;
2511    write->ir = inst->ir;
2512    write->annotation = inst->annotation;
2513    inst->insert_after(write);
2514
2515    inst->dst.file = temp.file;
2516    inst->dst.reg = temp.reg;
2517    inst->dst.reg_offset = temp.reg_offset;
2518    inst->dst.reladdr = NULL;
2519 }
2520
2521 /**
2522  * We can't generally support array access in GRF space, because a
2523  * single instruction's destination can only span 2 contiguous
2524  * registers.  So, we send all GRF arrays that get variable index
2525  * access to scratch space.
2526  */
2527 void
2528 vec4_visitor::move_grf_array_access_to_scratch()
2529 {
2530    int scratch_loc[this->virtual_grf_count];
2531
2532    for (int i = 0; i < this->virtual_grf_count; i++) {
2533       scratch_loc[i] = -1;
2534    }
2535
2536    /* First, calculate the set of virtual GRFs that need to be punted
2537     * to scratch due to having any array access on them, and where in
2538     * scratch.
2539     */
2540    foreach_list(node, &this->instructions) {
2541       vec4_instruction *inst = (vec4_instruction *)node;
2542
2543       if (inst->dst.file == GRF && inst->dst.reladdr &&
2544           scratch_loc[inst->dst.reg] == -1) {
2545          scratch_loc[inst->dst.reg] = c->last_scratch;
2546          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2547       }
2548
2549       for (int i = 0 ; i < 3; i++) {
2550          src_reg *src = &inst->src[i];
2551
2552          if (src->file == GRF && src->reladdr &&
2553              scratch_loc[src->reg] == -1) {
2554             scratch_loc[src->reg] = c->last_scratch;
2555             c->last_scratch += this->virtual_grf_sizes[src->reg];
2556          }
2557       }
2558    }
2559
2560    /* Now, for anything that will be accessed through scratch, rewrite
2561     * it to load/store.  Note that this is a _safe list walk, because
2562     * we may generate a new scratch_write instruction after the one
2563     * we're processing.
2564     */
2565    foreach_list_safe(node, &this->instructions) {
2566       vec4_instruction *inst = (vec4_instruction *)node;
2567
2568       /* Set up the annotation tracking for new generated instructions. */
2569       base_ir = inst->ir;
2570       current_annotation = inst->annotation;
2571
2572       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2573          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2574       }
2575
2576       for (int i = 0 ; i < 3; i++) {
2577          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2578             continue;
2579
2580          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2581
2582          emit_scratch_read(inst, temp, inst->src[i],
2583                            scratch_loc[inst->src[i].reg]);
2584
2585          inst->src[i].file = temp.file;
2586          inst->src[i].reg = temp.reg;
2587          inst->src[i].reg_offset = temp.reg_offset;
2588          inst->src[i].reladdr = NULL;
2589       }
2590    }
2591 }
2592
2593 /**
2594  * Emits an instruction before @inst to load the value named by @orig_src
2595  * from the pull constant buffer (surface) at @base_offset to @temp.
2596  */
2597 void
2598 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2599                                       dst_reg temp, src_reg orig_src,
2600                                       int base_offset)
2601 {
2602    int reg_offset = base_offset + orig_src.reg_offset;
2603    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2604    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2605    vec4_instruction *load;
2606
2607    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2608                                         temp, index, offset);
2609    load->base_mrf = 14;
2610    load->mlen = 1;
2611    emit_before(inst, load);
2612 }
2613
2614 /**
2615  * Implements array access of uniforms by inserting a
2616  * PULL_CONSTANT_LOAD instruction.
2617  *
2618  * Unlike temporary GRF array access (where we don't support it due to
2619  * the difficulty of doing relative addressing on instruction
2620  * destinations), we could potentially do array access of uniforms
2621  * that were loaded in GRF space as push constants.  In real-world
2622  * usage we've seen, though, the arrays being used are always larger
2623  * than we could load as push constants, so just always move all
2624  * uniform array access out to a pull constant buffer.
2625  */
2626 void
2627 vec4_visitor::move_uniform_array_access_to_pull_constants()
2628 {
2629    int pull_constant_loc[this->uniforms];
2630
2631    for (int i = 0; i < this->uniforms; i++) {
2632       pull_constant_loc[i] = -1;
2633    }
2634
2635    /* Walk through and find array access of uniforms.  Put a copy of that
2636     * uniform in the pull constant buffer.
2637     *
2638     * Note that we don't move constant-indexed accesses to arrays.  No
2639     * testing has been done of the performance impact of this choice.
2640     */
2641    foreach_list_safe(node, &this->instructions) {
2642       vec4_instruction *inst = (vec4_instruction *)node;
2643
2644       for (int i = 0 ; i < 3; i++) {
2645          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2646             continue;
2647
2648          int uniform = inst->src[i].reg;
2649
2650          /* If this array isn't already present in the pull constant buffer,
2651           * add it.
2652           */
2653          if (pull_constant_loc[uniform] == -1) {
2654             const float **values = &prog_data->param[uniform * 4];
2655
2656             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2657
2658             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2659                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2660             }
2661          }
2662
2663          /* Set up the annotation tracking for new generated instructions. */
2664          base_ir = inst->ir;
2665          current_annotation = inst->annotation;
2666
2667          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2668
2669          emit_pull_constant_load(inst, temp, inst->src[i],
2670                                  pull_constant_loc[uniform]);
2671
2672          inst->src[i].file = temp.file;
2673          inst->src[i].reg = temp.reg;
2674          inst->src[i].reg_offset = temp.reg_offset;
2675          inst->src[i].reladdr = NULL;
2676       }
2677    }
2678
2679    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2680     * no need to track them as larger-than-vec4 objects.  This will be
2681     * relied on in cutting out unused uniform vectors from push
2682     * constants.
2683     */
2684    split_uniform_registers();
2685 }
2686
2687 void
2688 vec4_visitor::resolve_ud_negate(src_reg *reg)
2689 {
2690    if (reg->type != BRW_REGISTER_TYPE_UD ||
2691        !reg->negate)
2692       return;
2693
2694    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2695    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2696    *reg = temp;
2697 }
2698
2699 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2700                            struct gl_shader_program *prog,
2701                            struct brw_shader *shader)
2702 {
2703    this->c = c;
2704    this->p = &c->func;
2705    this->brw = p->brw;
2706    this->intel = &brw->intel;
2707    this->ctx = &intel->ctx;
2708    this->prog = prog;
2709    this->shader = shader;
2710
2711    this->mem_ctx = ralloc_context(NULL);
2712    this->failed = false;
2713
2714    this->base_ir = NULL;
2715    this->current_annotation = NULL;
2716
2717    this->c = c;
2718    this->vp = &c->vp->program;
2719    this->prog_data = &c->prog_data;
2720
2721    this->variable_ht = hash_table_ctor(0,
2722                                        hash_table_pointer_hash,
2723                                        hash_table_pointer_compare);
2724
2725    this->virtual_grf_def = NULL;
2726    this->virtual_grf_use = NULL;
2727    this->virtual_grf_sizes = NULL;
2728    this->virtual_grf_count = 0;
2729    this->virtual_grf_reg_map = NULL;
2730    this->virtual_grf_reg_count = 0;
2731    this->virtual_grf_array_size = 0;
2732    this->live_intervals_valid = false;
2733
2734    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2735
2736    this->uniforms = 0;
2737 }
2738
2739 vec4_visitor::~vec4_visitor()
2740 {
2741    ralloc_free(this->mem_ctx);
2742    hash_table_dtor(this->variable_ht);
2743 }
2744
2745
2746 void
2747 vec4_visitor::fail(const char *format, ...)
2748 {
2749    va_list va;
2750    char *msg;
2751
2752    if (failed)
2753       return;
2754
2755    failed = true;
2756
2757    va_start(va, format);
2758    msg = ralloc_vasprintf(mem_ctx, format, va);
2759    va_end(va);
2760    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2761
2762    this->fail_msg = msg;
2763
2764    if (INTEL_DEBUG & DEBUG_VS) {
2765       fprintf(stderr, "%s",  msg);
2766    }
2767 }
2768
2769 } /* namespace brw */