src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 #include "program/sampler.h"
  29 }
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(vec4_visitor *v,
  34                                    enum opcode opcode, dst_reg dst,
  35                                    src_reg src0, src_reg src1, src_reg src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->ir = v->base_ir;
  43    this->annotation = v->current_annotation;
  44 }
  45
  46 vec4_instruction *
  47 vec4_visitor::emit(vec4_instruction *inst)
  48 {
  49    this->instructions.push_tail(inst);
  50
  51    return inst;
  52 }
  53
  54 vec4_instruction *
  55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  56 {
  57    new_inst->ir = inst->ir;
  58    new_inst->annotation = inst->annotation;
  59
  60    inst->insert_before(new_inst);
  61
  62    return inst;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  67                    src_reg src0, src_reg src1, src_reg src2)
  68 {
  69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  70                                              src0, src1, src2));
  71 }
  72
  73
  74 vec4_instruction *
  75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  76 {
  77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  78 }
  79
  80 vec4_instruction *
  81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  90 }
  91
  92 #define ALU1(op)                                                        \
  93    vec4_instruction *                                                   \
  94    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  95    {                                                                    \
  96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  97                                            src0);                       \
  98    }
  99
 100 #define ALU2(op)                                                        \
 101    vec4_instruction *                                                   \
 102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 103    {                                                                    \
 104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 105                                            src0, src1);                 \
 106    }
 107
 108 ALU1(NOT)
 109 ALU1(MOV)
 110 ALU1(FRC)
 111 ALU1(RNDD)
 112 ALU1(RNDE)
 113 ALU1(RNDZ)
 114 ALU2(ADD)
 115 ALU2(MUL)
 116 ALU2(MACH)
 117 ALU2(AND)
 118 ALU2(OR)
 119 ALU2(XOR)
 120 ALU2(DP3)
 121 ALU2(DP4)
 122 ALU2(DPH)
 123
 124 /** Gen4 predicated IF. */
 125 vec4_instruction *
 126 vec4_visitor::IF(uint32_t predicate)
 127 {
 128    vec4_instruction *inst;
 129
 130    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 131    inst->predicate = predicate;
 132
 133    return inst;
 134 }
 135
 136 /** Gen6+ IF with embedded comparison. */
 137 vec4_instruction *
 138 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 139 {
 140    assert(intel->gen >= 6);
 141
 142    vec4_instruction *inst;
 143
 144    resolve_ud_negate(&src0);
 145    resolve_ud_negate(&src1);
 146
 147    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 148                                         src0, src1);
 149    inst->conditional_mod = condition;
 150
 151    return inst;
 152 }
 153
 154 /**
 155  * CMP: Sets the low bit of the destination channels with the result
 156  * of the comparison, while the upper bits are undefined, and updates
 157  * the flag register with the packed 16 bits of the result.
 158  */
 159 vec4_instruction *
 160 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 161 {
 162    vec4_instruction *inst;
 163
 164    /* original gen4 does type conversion to the destination type
 165     * before before comparison, producing garbage results for floating
 166     * point comparisons.
 167     */
 168    if (intel->gen == 4) {
 169       dst.type = src0.type;
 170       if (dst.file == HW_REG)
 171          dst.fixed_hw_reg.type = dst.type;
 172    }
 173
 174    resolve_ud_negate(&src0);
 175    resolve_ud_negate(&src1);
 176
 177    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 178    inst->conditional_mod = condition;
 179
 180    return inst;
 181 }
 182
 183 vec4_instruction *
 184 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 185 {
 186    vec4_instruction *inst;
 187
 188    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 189                                         dst, index);
 190    inst->base_mrf = 14;
 191    inst->mlen = 2;
 192
 193    return inst;
 194 }
 195
 196 vec4_instruction *
 197 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 198 {
 199    vec4_instruction *inst;
 200
 201    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 202                                         dst, src, index);
 203    inst->base_mrf = 13;
 204    inst->mlen = 3;
 205
 206    return inst;
 207 }
 208
 209 void
 210 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 211 {
 212    static enum opcode dot_opcodes[] = {
 213       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 214    };
 215
 216    emit(dot_opcodes[elements - 2], dst, src0, src1);
 217 }
 218
 219 void
 220 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 221 {
 222    /* The gen6 math instruction ignores the source modifiers --
 223     * swizzle, abs, negate, and at least some parts of the register
 224     * region description.
 225     *
 226     * While it would seem that this MOV could be avoided at this point
 227     * in the case that the swizzle is matched up with the destination
 228     * writemask, note that uniform packing and register allocation
 229     * could rearrange our swizzle, so let's leave this matter up to
 230     * copy propagation later.
 231     */
 232    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 233    emit(MOV(dst_reg(temp_src), src));
 234
 235    if (dst.writemask != WRITEMASK_XYZW) {
 236       /* The gen6 math instruction must be align1, so we can't do
 237        * writemasks.
 238        */
 239       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 240
 241       emit(opcode, temp_dst, temp_src);
 242
 243       emit(MOV(dst, src_reg(temp_dst)));
 244    } else {
 245       emit(opcode, dst, temp_src);
 246    }
 247 }
 248
 249 void
 250 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 251 {
 252    vec4_instruction *inst = emit(opcode, dst, src);
 253    inst->base_mrf = 1;
 254    inst->mlen = 1;
 255 }
 256
 257 void
 258 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 259 {
 260    switch (opcode) {
 261    case SHADER_OPCODE_RCP:
 262    case SHADER_OPCODE_RSQ:
 263    case SHADER_OPCODE_SQRT:
 264    case SHADER_OPCODE_EXP2:
 265    case SHADER_OPCODE_LOG2:
 266    case SHADER_OPCODE_SIN:
 267    case SHADER_OPCODE_COS:
 268       break;
 269    default:
 270       assert(!"not reached: bad math opcode");
 271       return;
 272    }
 273
 274    if (intel->gen >= 7) {
 275       emit(opcode, dst, src);
 276    } else if (intel->gen == 6) {
 277       return emit_math1_gen6(opcode, dst, src);
 278    } else {
 279       return emit_math1_gen4(opcode, dst, src);
 280    }
 281 }
 282
 283 void
 284 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 285                               dst_reg dst, src_reg src0, src_reg src1)
 286 {
 287    src_reg expanded;
 288
 289    /* The gen6 math instruction ignores the source modifiers --
 290     * swizzle, abs, negate, and at least some parts of the register
 291     * region description.  Move the sources to temporaries to make it
 292     * generally work.
 293     */
 294
 295    expanded = src_reg(this, glsl_type::vec4_type);
 296    expanded.type = src0.type;
 297    emit(MOV(dst_reg(expanded), src0));
 298    src0 = expanded;
 299
 300    expanded = src_reg(this, glsl_type::vec4_type);
 301    expanded.type = src1.type;
 302    emit(MOV(dst_reg(expanded), src1));
 303    src1 = expanded;
 304
 305    if (dst.writemask != WRITEMASK_XYZW) {
 306       /* The gen6 math instruction must be align1, so we can't do
 307        * writemasks.
 308        */
 309       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 310       temp_dst.type = dst.type;
 311
 312       emit(opcode, temp_dst, src0, src1);
 313
 314       emit(MOV(dst, src_reg(temp_dst)));
 315    } else {
 316       emit(opcode, dst, src0, src1);
 317    }
 318 }
 319
 320 void
 321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 322                               dst_reg dst, src_reg src0, src_reg src1)
 323 {
 324    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 325    inst->base_mrf = 1;
 326    inst->mlen = 2;
 327 }
 328
 329 void
 330 vec4_visitor::emit_math(enum opcode opcode,
 331                         dst_reg dst, src_reg src0, src_reg src1)
 332 {
 333    switch (opcode) {
 334    case SHADER_OPCODE_POW:
 335    case SHADER_OPCODE_INT_QUOTIENT:
 336    case SHADER_OPCODE_INT_REMAINDER:
 337       break;
 338    default:
 339       assert(!"not reached: unsupported binary math opcode");
 340       return;
 341    }
 342
 343    if (intel->gen >= 7) {
 344       emit(opcode, dst, src0, src1);
 345    } else if (intel->gen == 6) {
 346       return emit_math2_gen6(opcode, dst, src0, src1);
 347    } else {
 348       return emit_math2_gen4(opcode, dst, src0, src1);
 349    }
 350 }
 351
 352 void
 353 vec4_visitor::visit_instructions(const exec_list *list)
 354 {
 355    foreach_list(node, list) {
 356       ir_instruction *ir = (ir_instruction *)node;
 357
 358       base_ir = ir;
 359       ir->accept(this);
 360    }
 361 }
 362
 363
 364 static int
 365 type_size(const struct glsl_type *type)
 366 {
 367    unsigned int i;
 368    int size;
 369
 370    switch (type->base_type) {
 371    case GLSL_TYPE_UINT:
 372    case GLSL_TYPE_INT:
 373    case GLSL_TYPE_FLOAT:
 374    case GLSL_TYPE_BOOL:
 375       if (type->is_matrix()) {
 376          return type->matrix_columns;
 377       } else {
 378          /* Regardless of size of vector, it gets a vec4. This is bad
 379           * packing for things like floats, but otherwise arrays become a
 380           * mess.  Hopefully a later pass over the code can pack scalars
 381           * down if appropriate.
 382           */
 383          return 1;
 384       }
 385    case GLSL_TYPE_ARRAY:
 386       assert(type->length > 0);
 387       return type_size(type->fields.array) * type->length;
 388    case GLSL_TYPE_STRUCT:
 389       size = 0;
 390       for (i = 0; i < type->length; i++) {
 391          size += type_size(type->fields.structure[i].type);
 392       }
 393       return size;
 394    case GLSL_TYPE_SAMPLER:
 395       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 396        * at link time.
 397        */
 398       return 1;
 399    default:
 400       assert(0);
 401       return 0;
 402    }
 403 }
 404
 405 int
 406 vec4_visitor::virtual_grf_alloc(int size)
 407 {
 408    if (virtual_grf_array_size <= virtual_grf_count) {
 409       if (virtual_grf_array_size == 0)
 410          virtual_grf_array_size = 16;
 411       else
 412          virtual_grf_array_size *= 2;
 413       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 414                                    virtual_grf_array_size);
 415       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 416                                      virtual_grf_array_size);
 417    }
 418    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 419    virtual_grf_reg_count += size;
 420    virtual_grf_sizes[virtual_grf_count] = size;
 421    return virtual_grf_count++;
 422 }
 423
 424 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 425 {
 426    init();
 427
 428    this->file = GRF;
 429    this->reg = v->virtual_grf_alloc(type_size(type));
 430
 431    if (type->is_array() || type->is_record()) {
 432       this->swizzle = BRW_SWIZZLE_NOOP;
 433    } else {
 434       this->swizzle = swizzle_for_size(type->vector_elements);
 435    }
 436
 437    this->type = brw_type_for_base_type(type);
 438 }
 439
 440 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 441 {
 442    init();
 443
 444    this->file = GRF;
 445    this->reg = v->virtual_grf_alloc(type_size(type));
 446
 447    if (type->is_array() || type->is_record()) {
 448       this->writemask = WRITEMASK_XYZW;
 449    } else {
 450       this->writemask = (1 << type->vector_elements) - 1;
 451    }
 452
 453    this->type = brw_type_for_base_type(type);
 454 }
 455
 456 /* Our support for uniforms is piggy-backed on the struct
 457  * gl_fragment_program, because that's where the values actually
 458  * get stored, rather than in some global gl_shader_program uniform
 459  * store.
 460  */
 461 int
 462 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 463 {
 464    unsigned int offset = 0;
 465    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 466
 467    if (type->is_matrix()) {
 468       const glsl_type *column = type->column_type();
 469
 470       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 471          offset += setup_uniform_values(loc + offset, column);
 472       }
 473
 474       return offset;
 475    }
 476
 477    switch (type->base_type) {
 478    case GLSL_TYPE_FLOAT:
 479    case GLSL_TYPE_UINT:
 480    case GLSL_TYPE_INT:
 481    case GLSL_TYPE_BOOL:
 482       for (unsigned int i = 0; i < type->vector_elements; i++) {
 483          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 484       }
 485
 486       /* Set up pad elements to get things aligned to a vec4 boundary. */
 487       for (unsigned int i = type->vector_elements; i < 4; i++) {
 488          static float zero = 0;
 489
 490          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 491       }
 492
 493       /* Track the size of this uniform vector, for future packing of
 494        * uniforms.
 495        */
 496       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 497       this->uniforms++;
 498
 499       return 1;
 500
 501    case GLSL_TYPE_STRUCT:
 502       for (unsigned int i = 0; i < type->length; i++) {
 503          offset += setup_uniform_values(loc + offset,
 504                                         type->fields.structure[i].type);
 505       }
 506       return offset;
 507
 508    case GLSL_TYPE_ARRAY:
 509       for (unsigned int i = 0; i < type->length; i++) {
 510          offset += setup_uniform_values(loc + offset, type->fields.array);
 511       }
 512       return offset;
 513
 514    case GLSL_TYPE_SAMPLER:
 515       /* The sampler takes up a slot, but we don't use any values from it. */
 516       return 1;
 517
 518    default:
 519       assert(!"not reached");
 520       return 0;
 521    }
 522 }
 523
 524 void
 525 vec4_visitor::setup_uniform_clipplane_values()
 526 {
 527    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 528
 529    /* Pre-Gen6, we compact clip planes.  For example, if the user
 530     * enables just clip planes 0, 1, and 3, we will enable clip planes
 531     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 532     * plane 2.  This simplifies the implementation of the Gen6 clip
 533     * thread.
 534     *
 535     * In Gen6 and later, we don't compact clip planes, because this
 536     * simplifies the implementation of gl_ClipDistance.
 537     */
 538    int compacted_clipplane_index = 0;
 539    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 540       if (intel->gen < 6 &&
 541           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
 542          continue;
 543       }
 544       this->uniform_vector_size[this->uniforms] = 4;
 545       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 546       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 547       for (int j = 0; j < 4; ++j) {
 548          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 549       }
 550       ++compacted_clipplane_index;
 551       ++this->uniforms;
 552    }
 553 }
 554
 555 /* Our support for builtin uniforms is even scarier than non-builtin.
 556  * It sits on top of the PROG_STATE_VAR parameters that are
 557  * automatically updated from GL context state.
 558  */
 559 void
 560 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 561 {
 562    const ir_state_slot *const slots = ir->state_slots;
 563    assert(ir->state_slots != NULL);
 564
 565    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 566       /* This state reference has already been setup by ir_to_mesa,
 567        * but we'll get the same index back here.  We can reference
 568        * ParameterValues directly, since unlike brw_fs.cpp, we never
 569        * add new state references during compile.
 570        */
 571       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 572                                             (gl_state_index *)slots[i].tokens);
 573       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 574
 575       this->uniform_vector_size[this->uniforms] = 0;
 576       /* Add each of the unique swizzled channels of the element.
 577        * This will end up matching the size of the glsl_type of this field.
 578        */
 579       int last_swiz = -1;
 580       for (unsigned int j = 0; j < 4; j++) {
 581          int swiz = GET_SWZ(slots[i].swizzle, j);
 582          last_swiz = swiz;
 583
 584          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 585          if (swiz <= last_swiz)
 586             this->uniform_vector_size[this->uniforms]++;
 587       }
 588       this->uniforms++;
 589    }
 590 }
 591
 592 dst_reg *
 593 vec4_visitor::variable_storage(ir_variable *var)
 594 {
 595    return (dst_reg *)hash_table_find(this->variable_ht, var);
 596 }
 597
 598 void
 599 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 600 {
 601    ir_expression *expr = ir->as_expression();
 602
 603    *predicate = BRW_PREDICATE_NORMAL;
 604
 605    if (expr) {
 606       src_reg op[2];
 607       vec4_instruction *inst;
 608
 609       assert(expr->get_num_operands() <= 2);
 610       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 611          expr->operands[i]->accept(this);
 612          op[i] = this->result;
 613
 614          resolve_ud_negate(&op[i]);
 615       }
 616
 617       switch (expr->operation) {
 618       case ir_unop_logic_not:
 619          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 620          inst->conditional_mod = BRW_CONDITIONAL_Z;
 621          break;
 622
 623       case ir_binop_logic_xor:
 624          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 625          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 626          break;
 627
 628       case ir_binop_logic_or:
 629          inst = emit(OR(dst_null_d(), op[0], op[1]));
 630          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 631          break;
 632
 633       case ir_binop_logic_and:
 634          inst = emit(AND(dst_null_d(), op[0], op[1]));
 635          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 636          break;
 637
 638       case ir_unop_f2b:
 639          if (intel->gen >= 6) {
 640             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 641          } else {
 642             inst = emit(MOV(dst_null_f(), op[0]));
 643             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 644          }
 645          break;
 646
 647       case ir_unop_i2b:
 648          if (intel->gen >= 6) {
 649             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 650          } else {
 651             inst = emit(MOV(dst_null_d(), op[0]));
 652             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 653          }
 654          break;
 655
 656       case ir_binop_all_equal:
 657          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 658          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 659          break;
 660
 661       case ir_binop_any_nequal:
 662          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 663          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 664          break;
 665
 666       case ir_unop_any:
 667          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 668          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 669          break;
 670
 671       case ir_binop_greater:
 672       case ir_binop_gequal:
 673       case ir_binop_less:
 674       case ir_binop_lequal:
 675       case ir_binop_equal:
 676       case ir_binop_nequal:
 677          emit(CMP(dst_null_d(), op[0], op[1],
 678                   brw_conditional_for_comparison(expr->operation)));
 679          break;
 680
 681       default:
 682          assert(!"not reached");
 683          break;
 684       }
 685       return;
 686    }
 687
 688    ir->accept(this);
 689
 690    resolve_ud_negate(&this->result);
 691
 692    if (intel->gen >= 6) {
 693       vec4_instruction *inst = emit(AND(dst_null_d(),
 694                                         this->result, src_reg(1)));
 695       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 696    } else {
 697       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 698       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 699    }
 700 }
 701
 702 /**
 703  * Emit a gen6 IF statement with the comparison folded into the IF
 704  * instruction.
 705  */
 706 void
 707 vec4_visitor::emit_if_gen6(ir_if *ir)
 708 {
 709    ir_expression *expr = ir->condition->as_expression();
 710
 711    if (expr) {
 712       src_reg op[2];
 713       dst_reg temp;
 714
 715       assert(expr->get_num_operands() <= 2);
 716       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 717          expr->operands[i]->accept(this);
 718          op[i] = this->result;
 719       }
 720
 721       switch (expr->operation) {
 722       case ir_unop_logic_not:
 723          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 724          return;
 725
 726       case ir_binop_logic_xor:
 727          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 728          return;
 729
 730       case ir_binop_logic_or:
 731          temp = dst_reg(this, glsl_type::bool_type);
 732          emit(OR(temp, op[0], op[1]));
 733          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 734          return;
 735
 736       case ir_binop_logic_and:
 737          temp = dst_reg(this, glsl_type::bool_type);
 738          emit(AND(temp, op[0], op[1]));
 739          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 740          return;
 741
 742       case ir_unop_f2b:
 743          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 744          return;
 745
 746       case ir_unop_i2b:
 747          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 748          return;
 749
 750       case ir_binop_greater:
 751       case ir_binop_gequal:
 752       case ir_binop_less:
 753       case ir_binop_lequal:
 754       case ir_binop_equal:
 755       case ir_binop_nequal:
 756          emit(IF(op[0], op[1],
 757                  brw_conditional_for_comparison(expr->operation)));
 758          return;
 759
 760       case ir_binop_all_equal:
 761          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 762          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 763          return;
 764
 765       case ir_binop_any_nequal:
 766          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 767          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 768          return;
 769
 770       case ir_unop_any:
 771          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 772          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 773          return;
 774
 775       default:
 776          assert(!"not reached");
 777          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 778          return;
 779       }
 780       return;
 781    }
 782
 783    ir->condition->accept(this);
 784
 785    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 786 }
 787
 788 void
 789 vec4_visitor::visit(ir_variable *ir)
 790 {
 791    dst_reg *reg = NULL;
 792
 793    if (variable_storage(ir))
 794       return;
 795
 796    switch (ir->mode) {
 797    case ir_var_in:
 798       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 799
 800       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 801        * come in as floating point conversions of the integer values.
 802        */
 803       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 804          if (!c->key.gl_fixed_input_size[i])
 805             continue;
 806
 807          dst_reg dst = *reg;
 808          dst.type = brw_type_for_base_type(ir->type);
 809          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 810          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 811       }
 812       break;
 813
 814    case ir_var_out:
 815       reg = new(mem_ctx) dst_reg(this, ir->type);
 816
 817       for (int i = 0; i < type_size(ir->type); i++) {
 818          output_reg[ir->location + i] = *reg;
 819          output_reg[ir->location + i].reg_offset = i;
 820          output_reg[ir->location + i].type =
 821             brw_type_for_base_type(ir->type->get_scalar_type());
 822          output_reg_annotation[ir->location + i] = ir->name;
 823       }
 824       break;
 825
 826    case ir_var_auto:
 827    case ir_var_temporary:
 828       reg = new(mem_ctx) dst_reg(this, ir->type);
 829       break;
 830
 831    case ir_var_uniform:
 832       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 833
 834       /* Thanks to the lower_ubo_reference pass, we will see only
 835        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 836        * variables, so no need for them to be in variable_ht.
 837        */
 838       if (ir->uniform_block != -1)
 839          return;
 840
 841       /* Track how big the whole uniform variable is, in case we need to put a
 842        * copy of its data into pull constants for array access.
 843        */
 844       this->uniform_size[this->uniforms] = type_size(ir->type);
 845
 846       if (!strncmp(ir->name, "gl_", 3)) {
 847          setup_builtin_uniform_values(ir);
 848       } else {
 849          setup_uniform_values(ir->location, ir->type);
 850       }
 851       break;
 852
 853    case ir_var_system_value:
 854       /* VertexID is stored by the VF as the last vertex element, but
 855        * we don't represent it with a flag in inputs_read, so we call
 856        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 857        */
 858       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 859       prog_data->uses_vertexid = true;
 860
 861       switch (ir->location) {
 862       case SYSTEM_VALUE_VERTEX_ID:
 863          reg->writemask = WRITEMASK_X;
 864          break;
 865       case SYSTEM_VALUE_INSTANCE_ID:
 866          reg->writemask = WRITEMASK_Y;
 867          break;
 868       default:
 869          assert(!"not reached");
 870          break;
 871       }
 872       break;
 873
 874    default:
 875       assert(!"not reached");
 876    }
 877
 878    reg->type = brw_type_for_base_type(ir->type);
 879    hash_table_insert(this->variable_ht, reg, ir);
 880 }
 881
 882 void
 883 vec4_visitor::visit(ir_loop *ir)
 884 {
 885    dst_reg counter;
 886
 887    /* We don't want debugging output to print the whole body of the
 888     * loop as the annotation.
 889     */
 890    this->base_ir = NULL;
 891
 892    if (ir->counter != NULL) {
 893       this->base_ir = ir->counter;
 894       ir->counter->accept(this);
 895       counter = *(variable_storage(ir->counter));
 896
 897       if (ir->from != NULL) {
 898          this->base_ir = ir->from;
 899          ir->from->accept(this);
 900
 901          emit(MOV(counter, this->result));
 902       }
 903    }
 904
 905    emit(BRW_OPCODE_DO);
 906
 907    if (ir->to) {
 908       this->base_ir = ir->to;
 909       ir->to->accept(this);
 910
 911       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 912                brw_conditional_for_comparison(ir->cmp)));
 913
 914       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 915       inst->predicate = BRW_PREDICATE_NORMAL;
 916    }
 917
 918    visit_instructions(&ir->body_instructions);
 919
 920
 921    if (ir->increment) {
 922       this->base_ir = ir->increment;
 923       ir->increment->accept(this);
 924       emit(ADD(counter, src_reg(counter), this->result));
 925    }
 926
 927    emit(BRW_OPCODE_WHILE);
 928 }
 929
 930 void
 931 vec4_visitor::visit(ir_loop_jump *ir)
 932 {
 933    switch (ir->mode) {
 934    case ir_loop_jump::jump_break:
 935       emit(BRW_OPCODE_BREAK);
 936       break;
 937    case ir_loop_jump::jump_continue:
 938       emit(BRW_OPCODE_CONTINUE);
 939       break;
 940    }
 941 }
 942
 943
 944 void
 945 vec4_visitor::visit(ir_function_signature *ir)
 946 {
 947    assert(0);
 948    (void)ir;
 949 }
 950
 951 void
 952 vec4_visitor::visit(ir_function *ir)
 953 {
 954    /* Ignore function bodies other than main() -- we shouldn't see calls to
 955     * them since they should all be inlined.
 956     */
 957    if (strcmp(ir->name, "main") == 0) {
 958       const ir_function_signature *sig;
 959       exec_list empty;
 960
 961       sig = ir->matching_signature(&empty);
 962
 963       assert(sig);
 964
 965       visit_instructions(&sig->body);
 966    }
 967 }
 968
 969 bool
 970 vec4_visitor::try_emit_sat(ir_expression *ir)
 971 {
 972    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 973    if (!sat_src)
 974       return false;
 975
 976    sat_src->accept(this);
 977    src_reg src = this->result;
 978
 979    this->result = src_reg(this, ir->type);
 980    vec4_instruction *inst;
 981    inst = emit(MOV(dst_reg(this->result), src));
 982    inst->saturate = true;
 983
 984    return true;
 985 }
 986
 987 void
 988 vec4_visitor::emit_bool_comparison(unsigned int op,
 989                                  dst_reg dst, src_reg src0, src_reg src1)
 990 {
 991    /* original gen4 does destination conversion before comparison. */
 992    if (intel->gen < 5)
 993       dst.type = src0.type;
 994
 995    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 996
 997    dst.type = BRW_REGISTER_TYPE_D;
 998    emit(AND(dst, src_reg(dst), src_reg(0x1)));
 999 }
1000
1001 void
1002 vec4_visitor::visit(ir_expression *ir)
1003 {
1004    unsigned int operand;
1005    src_reg op[Elements(ir->operands)];
1006    src_reg result_src;
1007    dst_reg result_dst;
1008    vec4_instruction *inst;
1009
1010    if (try_emit_sat(ir))
1011       return;
1012
1013    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1014       this->result.file = BAD_FILE;
1015       ir->operands[operand]->accept(this);
1016       if (this->result.file == BAD_FILE) {
1017          printf("Failed to get tree for expression operand:\n");
1018          ir->operands[operand]->print();
1019          exit(1);
1020       }
1021       op[operand] = this->result;
1022
1023       /* Matrix expression operands should have been broken down to vector
1024        * operations already.
1025        */
1026       assert(!ir->operands[operand]->type->is_matrix());
1027    }
1028
1029    int vector_elements = ir->operands[0]->type->vector_elements;
1030    if (ir->operands[1]) {
1031       vector_elements = MAX2(vector_elements,
1032                              ir->operands[1]->type->vector_elements);
1033    }
1034
1035    this->result.file = BAD_FILE;
1036
1037    /* Storage for our result.  Ideally for an assignment we'd be using
1038     * the actual storage for the result here, instead.
1039     */
1040    result_src = src_reg(this, ir->type);
1041    /* convenience for the emit functions below. */
1042    result_dst = dst_reg(result_src);
1043    /* If nothing special happens, this is the result. */
1044    this->result = result_src;
1045    /* Limit writes to the channels that will be used by result_src later.
1046     * This does limit this temp's use as a temporary for multi-instruction
1047     * sequences.
1048     */
1049    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1050
1051    switch (ir->operation) {
1052    case ir_unop_logic_not:
1053       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1054        * ones complement of the whole register, not just bit 0.
1055        */
1056       emit(XOR(result_dst, op[0], src_reg(1)));
1057       break;
1058    case ir_unop_neg:
1059       op[0].negate = !op[0].negate;
1060       this->result = op[0];
1061       break;
1062    case ir_unop_abs:
1063       op[0].abs = true;
1064       op[0].negate = false;
1065       this->result = op[0];
1066       break;
1067
1068    case ir_unop_sign:
1069       emit(MOV(result_dst, src_reg(0.0f)));
1070
1071       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1072       inst = emit(MOV(result_dst, src_reg(1.0f)));
1073       inst->predicate = BRW_PREDICATE_NORMAL;
1074
1075       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1076       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1077       inst->predicate = BRW_PREDICATE_NORMAL;
1078
1079       break;
1080
1081    case ir_unop_rcp:
1082       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1083       break;
1084
1085    case ir_unop_exp2:
1086       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1087       break;
1088    case ir_unop_log2:
1089       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1090       break;
1091    case ir_unop_exp:
1092    case ir_unop_log:
1093       assert(!"not reached: should be handled by ir_explog_to_explog2");
1094       break;
1095    case ir_unop_sin:
1096    case ir_unop_sin_reduced:
1097       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1098       break;
1099    case ir_unop_cos:
1100    case ir_unop_cos_reduced:
1101       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1102       break;
1103
1104    case ir_unop_dFdx:
1105    case ir_unop_dFdy:
1106       assert(!"derivatives not valid in vertex shader");
1107       break;
1108
1109    case ir_unop_noise:
1110       assert(!"not reached: should be handled by lower_noise");
1111       break;
1112
1113    case ir_binop_add:
1114       emit(ADD(result_dst, op[0], op[1]));
1115       break;
1116    case ir_binop_sub:
1117       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1118       break;
1119
1120    case ir_binop_mul:
1121       if (ir->type->is_integer()) {
1122          /* For integer multiplication, the MUL uses the low 16 bits
1123           * of one of the operands (src0 on gen6, src1 on gen7).  The
1124           * MACH accumulates in the contribution of the upper 16 bits
1125           * of that operand.
1126           *
1127           * FINISHME: Emit just the MUL if we know an operand is small
1128           * enough.
1129           */
1130          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1131
1132          emit(MUL(acc, op[0], op[1]));
1133          emit(MACH(dst_null_d(), op[0], op[1]));
1134          emit(MOV(result_dst, src_reg(acc)));
1135       } else {
1136          emit(MUL(result_dst, op[0], op[1]));
1137       }
1138       break;
1139    case ir_binop_div:
1140       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1141       assert(ir->type->is_integer());
1142       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1143       break;
1144    case ir_binop_mod:
1145       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1146       assert(ir->type->is_integer());
1147       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1148       break;
1149
1150    case ir_binop_less:
1151    case ir_binop_greater:
1152    case ir_binop_lequal:
1153    case ir_binop_gequal:
1154    case ir_binop_equal:
1155    case ir_binop_nequal: {
1156       emit(CMP(result_dst, op[0], op[1],
1157                brw_conditional_for_comparison(ir->operation)));
1158       emit(AND(result_dst, result_src, src_reg(0x1)));
1159       break;
1160    }
1161
1162    case ir_binop_all_equal:
1163       /* "==" operator producing a scalar boolean. */
1164       if (ir->operands[0]->type->is_vector() ||
1165           ir->operands[1]->type->is_vector()) {
1166          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1167          emit(MOV(result_dst, src_reg(0)));
1168          inst = emit(MOV(result_dst, src_reg(1)));
1169          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1170       } else {
1171          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1172          emit(AND(result_dst, result_src, src_reg(0x1)));
1173       }
1174       break;
1175    case ir_binop_any_nequal:
1176       /* "!=" operator producing a scalar boolean. */
1177       if (ir->operands[0]->type->is_vector() ||
1178           ir->operands[1]->type->is_vector()) {
1179          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1180
1181          emit(MOV(result_dst, src_reg(0)));
1182          inst = emit(MOV(result_dst, src_reg(1)));
1183          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1184       } else {
1185          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1186          emit(AND(result_dst, result_src, src_reg(0x1)));
1187       }
1188       break;
1189
1190    case ir_unop_any:
1191       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1192       emit(MOV(result_dst, src_reg(0)));
1193
1194       inst = emit(MOV(result_dst, src_reg(1)));
1195       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1196       break;
1197
1198    case ir_binop_logic_xor:
1199       emit(XOR(result_dst, op[0], op[1]));
1200       break;
1201
1202    case ir_binop_logic_or:
1203       emit(OR(result_dst, op[0], op[1]));
1204       break;
1205
1206    case ir_binop_logic_and:
1207       emit(AND(result_dst, op[0], op[1]));
1208       break;
1209
1210    case ir_binop_dot:
1211       assert(ir->operands[0]->type->is_vector());
1212       assert(ir->operands[0]->type == ir->operands[1]->type);
1213       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1214       break;
1215
1216    case ir_unop_sqrt:
1217       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1218       break;
1219    case ir_unop_rsq:
1220       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1221       break;
1222
1223    case ir_unop_bitcast_i2f:
1224    case ir_unop_bitcast_u2f:
1225       this->result = op[0];
1226       this->result.type = BRW_REGISTER_TYPE_F;
1227       break;
1228
1229    case ir_unop_bitcast_f2i:
1230       this->result = op[0];
1231       this->result.type = BRW_REGISTER_TYPE_D;
1232       break;
1233
1234    case ir_unop_bitcast_f2u:
1235       this->result = op[0];
1236       this->result.type = BRW_REGISTER_TYPE_UD;
1237       break;
1238
1239    case ir_unop_i2f:
1240    case ir_unop_i2u:
1241    case ir_unop_u2i:
1242    case ir_unop_u2f:
1243    case ir_unop_b2f:
1244    case ir_unop_b2i:
1245    case ir_unop_f2i:
1246    case ir_unop_f2u:
1247       emit(MOV(result_dst, op[0]));
1248       break;
1249    case ir_unop_f2b:
1250    case ir_unop_i2b: {
1251       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1252       emit(AND(result_dst, result_src, src_reg(1)));
1253       break;
1254    }
1255
1256    case ir_unop_trunc:
1257       emit(RNDZ(result_dst, op[0]));
1258       break;
1259    case ir_unop_ceil:
1260       op[0].negate = !op[0].negate;
1261       inst = emit(RNDD(result_dst, op[0]));
1262       this->result.negate = true;
1263       break;
1264    case ir_unop_floor:
1265       inst = emit(RNDD(result_dst, op[0]));
1266       break;
1267    case ir_unop_fract:
1268       inst = emit(FRC(result_dst, op[0]));
1269       break;
1270    case ir_unop_round_even:
1271       emit(RNDE(result_dst, op[0]));
1272       break;
1273
1274    case ir_binop_min:
1275       if (intel->gen >= 6) {
1276          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1277          inst->conditional_mod = BRW_CONDITIONAL_L;
1278       } else {
1279          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1280
1281          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1282          inst->predicate = BRW_PREDICATE_NORMAL;
1283       }
1284       break;
1285    case ir_binop_max:
1286       if (intel->gen >= 6) {
1287          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1288          inst->conditional_mod = BRW_CONDITIONAL_G;
1289       } else {
1290          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1291
1292          inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1293          inst->predicate = BRW_PREDICATE_NORMAL;
1294       }
1295       break;
1296
1297    case ir_binop_pow:
1298       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1299       break;
1300
1301    case ir_unop_bit_not:
1302       inst = emit(NOT(result_dst, op[0]));
1303       break;
1304    case ir_binop_bit_and:
1305       inst = emit(AND(result_dst, op[0], op[1]));
1306       break;
1307    case ir_binop_bit_xor:
1308       inst = emit(XOR(result_dst, op[0], op[1]));
1309       break;
1310    case ir_binop_bit_or:
1311       inst = emit(OR(result_dst, op[0], op[1]));
1312       break;
1313
1314    case ir_binop_lshift:
1315       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1316       break;
1317
1318    case ir_binop_rshift:
1319       if (ir->type->base_type == GLSL_TYPE_INT)
1320          inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1321       else
1322          inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1323       break;
1324
1325    case ir_binop_ubo_load: {
1326       ir_constant *uniform_block = ir->operands[0]->as_constant();
1327       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1328       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1329       src_reg offset = op[1];
1330
1331       /* Now, load the vector from that offset. */
1332       assert(ir->type->is_vector() || ir->type->is_scalar());
1333
1334       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1335       packed_consts.type = result.type;
1336       src_reg surf_index =
1337          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1338       if (const_offset_ir) {
1339          offset = src_reg(const_offset / 16);
1340       } else {
1341          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1342       }
1343
1344       vec4_instruction *pull =
1345          emit(new(mem_ctx) vec4_instruction(this,
1346                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1347                                             dst_reg(packed_consts),
1348                                             surf_index,
1349                                             offset));
1350       pull->base_mrf = 14;
1351       pull->mlen = 1;
1352
1353       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1354       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1355                                             const_offset % 16 / 4,
1356                                             const_offset % 16 / 4,
1357                                             const_offset % 16 / 4);
1358
1359       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1360       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1361          emit(CMP(result_dst, packed_consts, src_reg(0u),
1362                   BRW_CONDITIONAL_NZ));
1363          emit(AND(result_dst, result, src_reg(0x1)));
1364       } else {
1365          emit(MOV(result_dst, packed_consts));
1366       }
1367       break;
1368    }
1369
1370    case ir_quadop_vector:
1371       assert(!"not reached: should be handled by lower_quadop_vector");
1372       break;
1373    }
1374 }
1375
1376
1377 void
1378 vec4_visitor::visit(ir_swizzle *ir)
1379 {
1380    src_reg src;
1381    int i = 0;
1382    int swizzle[4];
1383
1384    /* Note that this is only swizzles in expressions, not those on the left
1385     * hand side of an assignment, which do write masking.  See ir_assignment
1386     * for that.
1387     */
1388
1389    ir->val->accept(this);
1390    src = this->result;
1391    assert(src.file != BAD_FILE);
1392
1393    for (i = 0; i < ir->type->vector_elements; i++) {
1394       switch (i) {
1395       case 0:
1396          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1397          break;
1398       case 1:
1399          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1400          break;
1401       case 2:
1402          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1403          break;
1404       case 3:
1405          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1406             break;
1407       }
1408    }
1409    for (; i < 4; i++) {
1410       /* Replicate the last channel out. */
1411       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1412    }
1413
1414    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1415
1416    this->result = src;
1417 }
1418
1419 void
1420 vec4_visitor::visit(ir_dereference_variable *ir)
1421 {
1422    const struct glsl_type *type = ir->type;
1423    dst_reg *reg = variable_storage(ir->var);
1424
1425    if (!reg) {
1426       fail("Failed to find variable storage for %s\n", ir->var->name);
1427       this->result = src_reg(brw_null_reg());
1428       return;
1429    }
1430
1431    this->result = src_reg(*reg);
1432
1433    /* System values get their swizzle from the dst_reg writemask */
1434    if (ir->var->mode == ir_var_system_value)
1435       return;
1436
1437    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1438       this->result.swizzle = swizzle_for_size(type->vector_elements);
1439 }
1440
1441 void
1442 vec4_visitor::visit(ir_dereference_array *ir)
1443 {
1444    ir_constant *constant_index;
1445    src_reg src;
1446    int element_size = type_size(ir->type);
1447
1448    constant_index = ir->array_index->constant_expression_value();
1449
1450    ir->array->accept(this);
1451    src = this->result;
1452
1453    if (constant_index) {
1454       src.reg_offset += constant_index->value.i[0] * element_size;
1455    } else {
1456       /* Variable index array dereference.  It eats the "vec4" of the
1457        * base of the array and an index that offsets the Mesa register
1458        * index.
1459        */
1460       ir->array_index->accept(this);
1461
1462       src_reg index_reg;
1463
1464       if (element_size == 1) {
1465          index_reg = this->result;
1466       } else {
1467          index_reg = src_reg(this, glsl_type::int_type);
1468
1469          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1470       }
1471
1472       if (src.reladdr) {
1473          src_reg temp = src_reg(this, glsl_type::int_type);
1474
1475          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1476
1477          index_reg = temp;
1478       }
1479
1480       src.reladdr = ralloc(mem_ctx, src_reg);
1481       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1482    }
1483
1484    /* If the type is smaller than a vec4, replicate the last channel out. */
1485    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1486       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1487    else
1488       src.swizzle = BRW_SWIZZLE_NOOP;
1489    src.type = brw_type_for_base_type(ir->type);
1490
1491    this->result = src;
1492 }
1493
1494 void
1495 vec4_visitor::visit(ir_dereference_record *ir)
1496 {
1497    unsigned int i;
1498    const glsl_type *struct_type = ir->record->type;
1499    int offset = 0;
1500
1501    ir->record->accept(this);
1502
1503    for (i = 0; i < struct_type->length; i++) {
1504       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1505          break;
1506       offset += type_size(struct_type->fields.structure[i].type);
1507    }
1508
1509    /* If the type is smaller than a vec4, replicate the last channel out. */
1510    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1511       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1512    else
1513       this->result.swizzle = BRW_SWIZZLE_NOOP;
1514    this->result.type = brw_type_for_base_type(ir->type);
1515
1516    this->result.reg_offset += offset;
1517 }
1518
1519 /**
1520  * We want to be careful in assignment setup to hit the actual storage
1521  * instead of potentially using a temporary like we might with the
1522  * ir_dereference handler.
1523  */
1524 static dst_reg
1525 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1526 {
1527    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1528     * access of a vector, it must be separated into a series conditional moves
1529     * before reaching this point (see ir_vec_index_to_cond_assign).
1530     */
1531    assert(ir->as_dereference());
1532    ir_dereference_array *deref_array = ir->as_dereference_array();
1533    if (deref_array) {
1534       assert(!deref_array->array->type->is_vector());
1535    }
1536
1537    /* Use the rvalue deref handler for the most part.  We'll ignore
1538     * swizzles in it and write swizzles using writemask, though.
1539     */
1540    ir->accept(v);
1541    return dst_reg(v->result);
1542 }
1543
1544 void
1545 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1546                               const struct glsl_type *type, uint32_t predicate)
1547 {
1548    if (type->base_type == GLSL_TYPE_STRUCT) {
1549       for (unsigned int i = 0; i < type->length; i++) {
1550          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1551       }
1552       return;
1553    }
1554
1555    if (type->is_array()) {
1556       for (unsigned int i = 0; i < type->length; i++) {
1557          emit_block_move(dst, src, type->fields.array, predicate);
1558       }
1559       return;
1560    }
1561
1562    if (type->is_matrix()) {
1563       const struct glsl_type *vec_type;
1564
1565       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1566                                          type->vector_elements, 1);
1567
1568       for (int i = 0; i < type->matrix_columns; i++) {
1569          emit_block_move(dst, src, vec_type, predicate);
1570       }
1571       return;
1572    }
1573
1574    assert(type->is_scalar() || type->is_vector());
1575
1576    dst->type = brw_type_for_base_type(type);
1577    src->type = dst->type;
1578
1579    dst->writemask = (1 << type->vector_elements) - 1;
1580
1581    src->swizzle = swizzle_for_size(type->vector_elements);
1582
1583    vec4_instruction *inst = emit(MOV(*dst, *src));
1584    inst->predicate = predicate;
1585
1586    dst->reg_offset++;
1587    src->reg_offset++;
1588 }
1589
1590
1591 /* If the RHS processing resulted in an instruction generating a
1592  * temporary value, and it would be easy to rewrite the instruction to
1593  * generate its result right into the LHS instead, do so.  This ends
1594  * up reliably removing instructions where it can be tricky to do so
1595  * later without real UD chain information.
1596  */
1597 bool
1598 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1599                                      dst_reg dst,
1600                                      src_reg src,
1601                                      vec4_instruction *pre_rhs_inst,
1602                                      vec4_instruction *last_rhs_inst)
1603 {
1604    /* This could be supported, but it would take more smarts. */
1605    if (ir->condition)
1606       return false;
1607
1608    if (pre_rhs_inst == last_rhs_inst)
1609       return false; /* No instructions generated to work with. */
1610
1611    /* Make sure the last instruction generated our source reg. */
1612    if (src.file != GRF ||
1613        src.file != last_rhs_inst->dst.file ||
1614        src.reg != last_rhs_inst->dst.reg ||
1615        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1616        src.reladdr ||
1617        src.abs ||
1618        src.negate ||
1619        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1620       return false;
1621
1622    /* Check that that last instruction fully initialized the channels
1623     * we want to use, in the order we want to use them.  We could
1624     * potentially reswizzle the operands of many instructions so that
1625     * we could handle out of order channels, but don't yet.
1626     */
1627
1628    for (unsigned i = 0; i < 4; i++) {
1629       if (dst.writemask & (1 << i)) {
1630          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1631             return false;
1632
1633          if (BRW_GET_SWZ(src.swizzle, i) != i)
1634             return false;
1635       }
1636    }
1637
1638    /* Success!  Rewrite the instruction. */
1639    last_rhs_inst->dst.file = dst.file;
1640    last_rhs_inst->dst.reg = dst.reg;
1641    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1642    last_rhs_inst->dst.reladdr = dst.reladdr;
1643    last_rhs_inst->dst.writemask &= dst.writemask;
1644
1645    return true;
1646 }
1647
1648 void
1649 vec4_visitor::visit(ir_assignment *ir)
1650 {
1651    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1652    uint32_t predicate = BRW_PREDICATE_NONE;
1653
1654    if (!ir->lhs->type->is_scalar() &&
1655        !ir->lhs->type->is_vector()) {
1656       ir->rhs->accept(this);
1657       src_reg src = this->result;
1658
1659       if (ir->condition) {
1660          emit_bool_to_cond_code(ir->condition, &predicate);
1661       }
1662
1663       /* emit_block_move doesn't account for swizzles in the source register.
1664        * This should be ok, since the source register is a structure or an
1665        * array, and those can't be swizzled.  But double-check to be sure.
1666        */
1667       assert(src.swizzle ==
1668              (ir->rhs->type->is_matrix()
1669               ? swizzle_for_size(ir->rhs->type->vector_elements)
1670               : BRW_SWIZZLE_NOOP));
1671
1672       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1673       return;
1674    }
1675
1676    /* Now we're down to just a scalar/vector with writemasks. */
1677    int i;
1678
1679    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1680    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1681
1682    ir->rhs->accept(this);
1683
1684    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1685
1686    src_reg src = this->result;
1687
1688    int swizzles[4];
1689    int first_enabled_chan = 0;
1690    int src_chan = 0;
1691
1692    assert(ir->lhs->type->is_vector() ||
1693           ir->lhs->type->is_scalar());
1694    dst.writemask = ir->write_mask;
1695
1696    for (int i = 0; i < 4; i++) {
1697       if (dst.writemask & (1 << i)) {
1698          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1699          break;
1700       }
1701    }
1702
1703    /* Swizzle a small RHS vector into the channels being written.
1704     *
1705     * glsl ir treats write_mask as dictating how many channels are
1706     * present on the RHS while in our instructions we need to make
1707     * those channels appear in the slots of the vec4 they're written to.
1708     */
1709    for (int i = 0; i < 4; i++) {
1710       if (dst.writemask & (1 << i))
1711          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1712       else
1713          swizzles[i] = first_enabled_chan;
1714    }
1715    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1716                               swizzles[2], swizzles[3]);
1717
1718    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1719       return;
1720    }
1721
1722    if (ir->condition) {
1723       emit_bool_to_cond_code(ir->condition, &predicate);
1724    }
1725
1726    for (i = 0; i < type_size(ir->lhs->type); i++) {
1727       vec4_instruction *inst = emit(MOV(dst, src));
1728       inst->predicate = predicate;
1729
1730       dst.reg_offset++;
1731       src.reg_offset++;
1732    }
1733 }
1734
1735 void
1736 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1737 {
1738    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1739       foreach_list(node, &ir->components) {
1740          ir_constant *field_value = (ir_constant *)node;
1741
1742          emit_constant_values(dst, field_value);
1743       }
1744       return;
1745    }
1746
1747    if (ir->type->is_array()) {
1748       for (unsigned int i = 0; i < ir->type->length; i++) {
1749          emit_constant_values(dst, ir->array_elements[i]);
1750       }
1751       return;
1752    }
1753
1754    if (ir->type->is_matrix()) {
1755       for (int i = 0; i < ir->type->matrix_columns; i++) {
1756          float *vec = &ir->value.f[i * ir->type->vector_elements];
1757
1758          for (int j = 0; j < ir->type->vector_elements; j++) {
1759             dst->writemask = 1 << j;
1760             dst->type = BRW_REGISTER_TYPE_F;
1761
1762             emit(MOV(*dst, src_reg(vec[j])));
1763          }
1764          dst->reg_offset++;
1765       }
1766       return;
1767    }
1768
1769    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1770
1771    for (int i = 0; i < ir->type->vector_elements; i++) {
1772       if (!(remaining_writemask & (1 << i)))
1773          continue;
1774
1775       dst->writemask = 1 << i;
1776       dst->type = brw_type_for_base_type(ir->type);
1777
1778       /* Find other components that match the one we're about to
1779        * write.  Emits fewer instructions for things like vec4(0.5,
1780        * 1.5, 1.5, 1.5).
1781        */
1782       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1783          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1784             if (ir->value.b[i] == ir->value.b[j])
1785                dst->writemask |= (1 << j);
1786          } else {
1787             /* u, i, and f storage all line up, so no need for a
1788              * switch case for comparing each type.
1789              */
1790             if (ir->value.u[i] == ir->value.u[j])
1791                dst->writemask |= (1 << j);
1792          }
1793       }
1794
1795       switch (ir->type->base_type) {
1796       case GLSL_TYPE_FLOAT:
1797          emit(MOV(*dst, src_reg(ir->value.f[i])));
1798          break;
1799       case GLSL_TYPE_INT:
1800          emit(MOV(*dst, src_reg(ir->value.i[i])));
1801          break;
1802       case GLSL_TYPE_UINT:
1803          emit(MOV(*dst, src_reg(ir->value.u[i])));
1804          break;
1805       case GLSL_TYPE_BOOL:
1806          emit(MOV(*dst, src_reg(ir->value.b[i])));
1807          break;
1808       default:
1809          assert(!"Non-float/uint/int/bool constant");
1810          break;
1811       }
1812
1813       remaining_writemask &= ~dst->writemask;
1814    }
1815    dst->reg_offset++;
1816 }
1817
1818 void
1819 vec4_visitor::visit(ir_constant *ir)
1820 {
1821    dst_reg dst = dst_reg(this, ir->type);
1822    this->result = src_reg(dst);
1823
1824    emit_constant_values(&dst, ir);
1825 }
1826
1827 void
1828 vec4_visitor::visit(ir_call *ir)
1829 {
1830    assert(!"not reached");
1831 }
1832
1833 void
1834 vec4_visitor::visit(ir_texture *ir)
1835 {
1836    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1837
1838    /* Should be lowered by do_lower_texture_projection */
1839    assert(!ir->projector);
1840
1841    /* Generate code to compute all the subexpression trees.  This has to be
1842     * done before loading any values into MRFs for the sampler message since
1843     * generating these values may involve SEND messages that need the MRFs.
1844     */
1845    src_reg coordinate;
1846    if (ir->coordinate) {
1847       ir->coordinate->accept(this);
1848       coordinate = this->result;
1849    }
1850
1851    src_reg shadow_comparitor;
1852    if (ir->shadow_comparitor) {
1853       ir->shadow_comparitor->accept(this);
1854       shadow_comparitor = this->result;
1855    }
1856
1857    src_reg lod, dPdx, dPdy;
1858    switch (ir->op) {
1859    case ir_txf:
1860    case ir_txl:
1861    case ir_txs:
1862       ir->lod_info.lod->accept(this);
1863       lod = this->result;
1864       break;
1865    case ir_txd:
1866       ir->lod_info.grad.dPdx->accept(this);
1867       dPdx = this->result;
1868
1869       ir->lod_info.grad.dPdy->accept(this);
1870       dPdy = this->result;
1871       break;
1872    case ir_tex:
1873    case ir_txb:
1874       break;
1875    }
1876
1877    vec4_instruction *inst = NULL;
1878    switch (ir->op) {
1879    case ir_tex:
1880    case ir_txl:
1881       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1882       break;
1883    case ir_txd:
1884       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1885       break;
1886    case ir_txf:
1887       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1888       break;
1889    case ir_txs:
1890       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1891       break;
1892    case ir_txb:
1893       assert(!"TXB is not valid for vertex shaders.");
1894    }
1895
1896    /* Texel offsets go in the message header; Gen4 also requires headers. */
1897    inst->header_present = ir->offset || intel->gen < 5;
1898    inst->base_mrf = 2;
1899    inst->mlen = inst->header_present + 1; /* always at least one */
1900    inst->sampler = sampler;
1901    inst->dst = dst_reg(this, ir->type);
1902    inst->shadow_compare = ir->shadow_comparitor != NULL;
1903
1904    if (ir->offset != NULL && ir->op != ir_txf)
1905       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1906
1907    /* MRF for the first parameter */
1908    int param_base = inst->base_mrf + inst->header_present;
1909
1910    if (ir->op == ir_txs) {
1911       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1912       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1913            lod));
1914    } else {
1915       int i, coord_mask = 0, zero_mask = 0;
1916       /* Load the coordinate */
1917       /* FINISHME: gl_clamp_mask and saturate */
1918       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1919          coord_mask |= (1 << i);
1920       for (; i < 4; i++)
1921          zero_mask |= (1 << i);
1922
1923       if (ir->offset && ir->op == ir_txf) {
1924          /* It appears that the ld instruction used for txf does its
1925           * address bounds check before adding in the offset.  To work
1926           * around this, just add the integer offset to the integer
1927           * texel coordinate, and don't put the offset in the header.
1928           */
1929          ir_constant *offset = ir->offset->as_constant();
1930          assert(offset);
1931
1932          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1933             src_reg src = coordinate;
1934             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1935                                        BRW_GET_SWZ(src.swizzle, j),
1936                                        BRW_GET_SWZ(src.swizzle, j),
1937                                        BRW_GET_SWZ(src.swizzle, j));
1938             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1939                      src, offset->value.i[j]));
1940          }
1941       } else {
1942          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1943                   coordinate));
1944       }
1945       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1946                src_reg(0)));
1947       /* Load the shadow comparitor */
1948       if (ir->shadow_comparitor) {
1949          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1950                           WRITEMASK_X),
1951                   shadow_comparitor));
1952          inst->mlen++;
1953       }
1954
1955       /* Load the LOD info */
1956       if (ir->op == ir_txl) {
1957          int mrf, writemask;
1958          if (intel->gen >= 5) {
1959             mrf = param_base + 1;
1960             if (ir->shadow_comparitor) {
1961                writemask = WRITEMASK_Y;
1962                /* mlen already incremented */
1963             } else {
1964                writemask = WRITEMASK_X;
1965                inst->mlen++;
1966             }
1967          } else /* intel->gen == 4 */ {
1968             mrf = param_base;
1969             writemask = WRITEMASK_Z;
1970          }
1971          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
1972       } else if (ir->op == ir_txf) {
1973          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1974                   lod));
1975       } else if (ir->op == ir_txd) {
1976          const glsl_type *type = ir->lod_info.grad.dPdx->type;
1977
1978          if (intel->gen >= 5) {
1979             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1980             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1981             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1982             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1983             inst->mlen++;
1984
1985             if (ir->type->vector_elements == 3) {
1986                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1987                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1988                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1989                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1990                inst->mlen++;
1991             }
1992          } else /* intel->gen == 4 */ {
1993             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1994             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1995             inst->mlen += 2;
1996          }
1997       }
1998    }
1999
2000    emit(inst);
2001
2002    swizzle_result(ir, src_reg(inst->dst), sampler);
2003 }
2004
2005 void
2006 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2007 {
2008    this->result = orig_val;
2009
2010    int s = c->key.tex.swizzles[sampler];
2011
2012    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2013                         || s == SWIZZLE_NOOP)
2014       return;
2015
2016    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2017    int swizzle[4];
2018
2019    for (int i = 0; i < 4; i++) {
2020       switch (GET_SWZ(s, i)) {
2021       case SWIZZLE_ZERO:
2022          zero_mask |= (1 << i);
2023          break;
2024       case SWIZZLE_ONE:
2025          one_mask |= (1 << i);
2026          break;
2027       default:
2028          copy_mask |= (1 << i);
2029          swizzle[i] = GET_SWZ(s, i);
2030          break;
2031       }
2032    }
2033
2034    this->result = src_reg(this, ir->type);
2035    dst_reg swizzled_result(this->result);
2036
2037    if (copy_mask) {
2038       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2039       swizzled_result.writemask = copy_mask;
2040       emit(MOV(swizzled_result, orig_val));
2041    }
2042
2043    if (zero_mask) {
2044       swizzled_result.writemask = zero_mask;
2045       emit(MOV(swizzled_result, src_reg(0.0f)));
2046    }
2047
2048    if (one_mask) {
2049       swizzled_result.writemask = one_mask;
2050       emit(MOV(swizzled_result, src_reg(1.0f)));
2051    }
2052 }
2053
2054 void
2055 vec4_visitor::visit(ir_return *ir)
2056 {
2057    assert(!"not reached");
2058 }
2059
2060 void
2061 vec4_visitor::visit(ir_discard *ir)
2062 {
2063    assert(!"not reached");
2064 }
2065
2066 void
2067 vec4_visitor::visit(ir_if *ir)
2068 {
2069    /* Don't point the annotation at the if statement, because then it plus
2070     * the then and else blocks get printed.
2071     */
2072    this->base_ir = ir->condition;
2073
2074    if (intel->gen == 6) {
2075       emit_if_gen6(ir);
2076    } else {
2077       uint32_t predicate;
2078       emit_bool_to_cond_code(ir->condition, &predicate);
2079       emit(IF(predicate));
2080    }
2081
2082    visit_instructions(&ir->then_instructions);
2083
2084    if (!ir->else_instructions.is_empty()) {
2085       this->base_ir = ir->condition;
2086       emit(BRW_OPCODE_ELSE);
2087
2088       visit_instructions(&ir->else_instructions);
2089    }
2090
2091    this->base_ir = ir->condition;
2092    emit(BRW_OPCODE_ENDIF);
2093 }
2094
2095 void
2096 vec4_visitor::emit_ndc_computation()
2097 {
2098    /* Get the position */
2099    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2100
2101    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2102    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2103    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2104
2105    current_annotation = "NDC";
2106    dst_reg ndc_w = ndc;
2107    ndc_w.writemask = WRITEMASK_W;
2108    src_reg pos_w = pos;
2109    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2110    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2111
2112    dst_reg ndc_xyz = ndc;
2113    ndc_xyz.writemask = WRITEMASK_XYZ;
2114
2115    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2116 }
2117
2118 void
2119 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2120 {
2121    if (intel->gen < 6 &&
2122        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2123         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2124       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2125       dst_reg header1_w = header1;
2126       header1_w.writemask = WRITEMASK_W;
2127       GLuint i;
2128
2129       emit(MOV(header1, 0u));
2130
2131       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2132          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2133
2134          current_annotation = "Point size";
2135          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2136          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2137       }
2138
2139       current_annotation = "Clipping flags";
2140       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2141          vec4_instruction *inst;
2142
2143          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2144                          src_reg(this->userplane[i])));
2145          inst->conditional_mod = BRW_CONDITIONAL_L;
2146
2147          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2148          inst->predicate = BRW_PREDICATE_NORMAL;
2149       }
2150
2151       /* i965 clipping workaround:
2152        * 1) Test for -ve rhw
2153        * 2) If set,
2154        *      set ndc = (0,0,0,0)
2155        *      set ucp[6] = 1
2156        *
2157        * Later, clipping will detect ucp[6] and ensure the primitive is
2158        * clipped against all fixed planes.
2159        */
2160       if (brw->has_negative_rhw_bug) {
2161 #if 0
2162          /* FINISHME */
2163          brw_CMP(p,
2164                  vec8(brw_null_reg()),
2165                  BRW_CONDITIONAL_L,
2166                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2167                  brw_imm_f(0));
2168
2169          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2170          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2171          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2172 #endif
2173       }
2174
2175       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2176    } else if (intel->gen < 6) {
2177       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2178    } else {
2179       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2180       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2181          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2182                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2183       }
2184    }
2185 }
2186
2187 void
2188 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2189 {
2190    if (intel->gen < 6) {
2191       /* Clip distance slots are set aside in gen5, but they are not used.  It
2192        * is not clear whether we actually need to set aside space for them,
2193        * but the performance cost is negligible.
2194        */
2195       return;
2196    }
2197
2198    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2199     *
2200     *     "If a linked set of shaders forming the vertex stage contains no
2201     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2202     *     application has requested clipping against user clip planes through
2203     *     the API, then the coordinate written to gl_Position is used for
2204     *     comparison against the user clip planes."
2205     *
2206     * This function is only called if the shader didn't write to
2207     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2208     * if the user wrote to it; otherwise we use gl_Position.
2209     */
2210    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2211    if (!(c->prog_data.outputs_written
2212          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2213       clip_vertex = VERT_RESULT_HPOS;
2214    }
2215
2216    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2217         ++i) {
2218       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2219                src_reg(output_reg[clip_vertex]),
2220                src_reg(this->userplane[i + offset])));
2221    }
2222 }
2223
2224 void
2225 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2226 {
2227    assert (vert_result < VERT_RESULT_MAX);
2228    reg.type = output_reg[vert_result].type;
2229    current_annotation = output_reg_annotation[vert_result];
2230    /* Copy the register, saturating if necessary */
2231    vec4_instruction *inst = emit(MOV(reg,
2232                                      src_reg(output_reg[vert_result])));
2233    if ((vert_result == VERT_RESULT_COL0 ||
2234         vert_result == VERT_RESULT_COL1 ||
2235         vert_result == VERT_RESULT_BFC0 ||
2236         vert_result == VERT_RESULT_BFC1) &&
2237        c->key.clamp_vertex_color) {
2238       inst->saturate = true;
2239    }
2240 }
2241
2242 void
2243 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2244 {
2245    struct brw_reg hw_reg = brw_message_reg(mrf);
2246    dst_reg reg = dst_reg(MRF, mrf);
2247    reg.type = BRW_REGISTER_TYPE_F;
2248
2249    switch (vert_result) {
2250    case VERT_RESULT_PSIZ:
2251       /* PSIZ is always in slot 0, and is coupled with other flags. */
2252       current_annotation = "indices, point width, clip flags";
2253       emit_psiz_and_flags(hw_reg);
2254       break;
2255    case BRW_VERT_RESULT_NDC:
2256       current_annotation = "NDC";
2257       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2258       break;
2259    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2260    case VERT_RESULT_HPOS:
2261       current_annotation = "gl_Position";
2262       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2263       break;
2264    case VERT_RESULT_CLIP_DIST0:
2265    case VERT_RESULT_CLIP_DIST1:
2266       if (this->c->key.uses_clip_distance) {
2267          emit_generic_urb_slot(reg, vert_result);
2268       } else {
2269          current_annotation = "user clip distances";
2270          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2271       }
2272       break;
2273    case VERT_RESULT_EDGE:
2274       /* This is present when doing unfilled polygons.  We're supposed to copy
2275        * the edge flag from the user-provided vertex array
2276        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2277        * of that attribute (starts as 1.0f).  This is then used in clipping to
2278        * determine which edges should be drawn as wireframe.
2279        */
2280       current_annotation = "edge flag";
2281       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2282                                     glsl_type::float_type, WRITEMASK_XYZW))));
2283       break;
2284    case BRW_VERT_RESULT_PAD:
2285       /* No need to write to this slot */
2286       break;
2287    default:
2288       emit_generic_urb_slot(reg, vert_result);
2289       break;
2290    }
2291 }
2292
2293 static int
2294 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2295 {
2296    struct intel_context *intel = &brw->intel;
2297
2298    if (intel->gen >= 6) {
2299       /* URB data written (does not include the message header reg) must
2300        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2301        * section 5.4.3.2.2: URB_INTERLEAVED.
2302        *
2303        * URB entries are allocated on a multiple of 1024 bits, so an
2304        * extra 128 bits written here to make the end align to 256 is
2305        * no problem.
2306        */
2307       if ((mlen % 2) != 1)
2308          mlen++;
2309    }
2310
2311    return mlen;
2312 }
2313
2314 /**
2315  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2316  * complete the VS thread.
2317  *
2318  * The VUE layout is documented in Volume 2a.
2319  */
2320 void
2321 vec4_visitor::emit_urb_writes()
2322 {
2323    /* MRF 0 is reserved for the debugger, so start with message header
2324     * in MRF 1.
2325     */
2326    int base_mrf = 1;
2327    int mrf = base_mrf;
2328    /* In the process of generating our URB write message contents, we
2329     * may need to unspill a register or load from an array.  Those
2330     * reads would use MRFs 14-15.
2331     */
2332    int max_usable_mrf = 13;
2333
2334    /* The following assertion verifies that max_usable_mrf causes an
2335     * even-numbered amount of URB write data, which will meet gen6's
2336     * requirements for length alignment.
2337     */
2338    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2339
2340    /* First mrf is the g0-based message header containing URB handles and such,
2341     * which is implied in VS_OPCODE_URB_WRITE.
2342     */
2343    mrf++;
2344
2345    if (intel->gen < 6) {
2346       emit_ndc_computation();
2347    }
2348
2349    /* Set up the VUE data for the first URB write */
2350    int slot;
2351    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2352       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2353
2354       /* If this was max_usable_mrf, we can't fit anything more into this URB
2355        * WRITE.
2356        */
2357       if (mrf > max_usable_mrf) {
2358          slot++;
2359          break;
2360       }
2361    }
2362
2363    current_annotation = "URB write";
2364    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2365    inst->base_mrf = base_mrf;
2366    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2367    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2368
2369    /* Optional second URB write */
2370    if (!inst->eot) {
2371       mrf = base_mrf + 1;
2372
2373       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2374          assert(mrf < max_usable_mrf);
2375
2376          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2377       }
2378
2379       current_annotation = "URB write";
2380       inst = emit(VS_OPCODE_URB_WRITE);
2381       inst->base_mrf = base_mrf;
2382       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2383       inst->eot = true;
2384       /* URB destination offset.  In the previous write, we got MRFs
2385        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2386        * URB row increments, and each of our MRFs is half of one of
2387        * those, since we're doing interleaved writes.
2388        */
2389       inst->offset = (max_usable_mrf - base_mrf) / 2;
2390    }
2391 }
2392
2393 src_reg
2394 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2395                                  src_reg *reladdr, int reg_offset)
2396 {
2397    /* Because we store the values to scratch interleaved like our
2398     * vertex data, we need to scale the vec4 index by 2.
2399     */
2400    int message_header_scale = 2;
2401
2402    /* Pre-gen6, the message header uses byte offsets instead of vec4
2403     * (16-byte) offset units.
2404     */
2405    if (intel->gen < 6)
2406       message_header_scale *= 16;
2407
2408    if (reladdr) {
2409       src_reg index = src_reg(this, glsl_type::int_type);
2410
2411       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2412       emit_before(inst, MUL(dst_reg(index),
2413                             index, src_reg(message_header_scale)));
2414
2415       return index;
2416    } else {
2417       return src_reg(reg_offset * message_header_scale);
2418    }
2419 }
2420
2421 src_reg
2422 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2423                                        src_reg *reladdr, int reg_offset)
2424 {
2425    if (reladdr) {
2426       src_reg index = src_reg(this, glsl_type::int_type);
2427
2428       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2429
2430       /* Pre-gen6, the message header uses byte offsets instead of vec4
2431        * (16-byte) offset units.
2432        */
2433       if (intel->gen < 6) {
2434          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2435       }
2436
2437       return index;
2438    } else {
2439       int message_header_scale = intel->gen < 6 ? 16 : 1;
2440       return src_reg(reg_offset * message_header_scale);
2441    }
2442 }
2443
2444 /**
2445  * Emits an instruction before @inst to load the value named by @orig_src
2446  * from scratch space at @base_offset to @temp.
2447  *
2448  * @base_offset is measured in 32-byte units (the size of a register).
2449  */
2450 void
2451 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2452                                 dst_reg temp, src_reg orig_src,
2453                                 int base_offset)
2454 {
2455    int reg_offset = base_offset + orig_src.reg_offset;
2456    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2457
2458    emit_before(inst, SCRATCH_READ(temp, index));
2459 }
2460
2461 /**
2462  * Emits an instruction after @inst to store the value to be written
2463  * to @orig_dst to scratch space at @base_offset, from @temp.
2464  *
2465  * @base_offset is measured in 32-byte units (the size of a register).
2466  */
2467 void
2468 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2469 {
2470    int reg_offset = base_offset + inst->dst.reg_offset;
2471    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2472
2473    /* Create a temporary register to store *inst's result in.
2474     *
2475     * We have to be careful in MOVing from our temporary result register in
2476     * the scratch write.  If we swizzle from channels of the temporary that
2477     * weren't initialized, it will confuse live interval analysis, which will
2478     * make spilling fail to make progress.
2479     */
2480    src_reg temp = src_reg(this, glsl_type::vec4_type);
2481    temp.type = inst->dst.type;
2482    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2483    int swizzles[4];
2484    for (int i = 0; i < 4; i++)
2485       if (inst->dst.writemask & (1 << i))
2486          swizzles[i] = i;
2487       else
2488          swizzles[i] = first_writemask_chan;
2489    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2490                                swizzles[2], swizzles[3]);
2491
2492    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2493                                        inst->dst.writemask));
2494    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2495    write->predicate = inst->predicate;
2496    write->ir = inst->ir;
2497    write->annotation = inst->annotation;
2498    inst->insert_after(write);
2499
2500    inst->dst.file = temp.file;
2501    inst->dst.reg = temp.reg;
2502    inst->dst.reg_offset = temp.reg_offset;
2503    inst->dst.reladdr = NULL;
2504 }
2505
2506 /**
2507  * We can't generally support array access in GRF space, because a
2508  * single instruction's destination can only span 2 contiguous
2509  * registers.  So, we send all GRF arrays that get variable index
2510  * access to scratch space.
2511  */
2512 void
2513 vec4_visitor::move_grf_array_access_to_scratch()
2514 {
2515    int scratch_loc[this->virtual_grf_count];
2516
2517    for (int i = 0; i < this->virtual_grf_count; i++) {
2518       scratch_loc[i] = -1;
2519    }
2520
2521    /* First, calculate the set of virtual GRFs that need to be punted
2522     * to scratch due to having any array access on them, and where in
2523     * scratch.
2524     */
2525    foreach_list(node, &this->instructions) {
2526       vec4_instruction *inst = (vec4_instruction *)node;
2527
2528       if (inst->dst.file == GRF && inst->dst.reladdr &&
2529           scratch_loc[inst->dst.reg] == -1) {
2530          scratch_loc[inst->dst.reg] = c->last_scratch;
2531          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2532       }
2533
2534       for (int i = 0 ; i < 3; i++) {
2535          src_reg *src = &inst->src[i];
2536
2537          if (src->file == GRF && src->reladdr &&
2538              scratch_loc[src->reg] == -1) {
2539             scratch_loc[src->reg] = c->last_scratch;
2540             c->last_scratch += this->virtual_grf_sizes[src->reg];
2541          }
2542       }
2543    }
2544
2545    /* Now, for anything that will be accessed through scratch, rewrite
2546     * it to load/store.  Note that this is a _safe list walk, because
2547     * we may generate a new scratch_write instruction after the one
2548     * we're processing.
2549     */
2550    foreach_list_safe(node, &this->instructions) {
2551       vec4_instruction *inst = (vec4_instruction *)node;
2552
2553       /* Set up the annotation tracking for new generated instructions. */
2554       base_ir = inst->ir;
2555       current_annotation = inst->annotation;
2556
2557       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2558          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2559       }
2560
2561       for (int i = 0 ; i < 3; i++) {
2562          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2563             continue;
2564
2565          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2566
2567          emit_scratch_read(inst, temp, inst->src[i],
2568                            scratch_loc[inst->src[i].reg]);
2569
2570          inst->src[i].file = temp.file;
2571          inst->src[i].reg = temp.reg;
2572          inst->src[i].reg_offset = temp.reg_offset;
2573          inst->src[i].reladdr = NULL;
2574       }
2575    }
2576 }
2577
2578 /**
2579  * Emits an instruction before @inst to load the value named by @orig_src
2580  * from the pull constant buffer (surface) at @base_offset to @temp.
2581  */
2582 void
2583 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2584                                       dst_reg temp, src_reg orig_src,
2585                                       int base_offset)
2586 {
2587    int reg_offset = base_offset + orig_src.reg_offset;
2588    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2589    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2590    vec4_instruction *load;
2591
2592    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2593                                         temp, index, offset);
2594    load->base_mrf = 14;
2595    load->mlen = 1;
2596    emit_before(inst, load);
2597 }
2598
2599 /**
2600  * Implements array access of uniforms by inserting a
2601  * PULL_CONSTANT_LOAD instruction.
2602  *
2603  * Unlike temporary GRF array access (where we don't support it due to
2604  * the difficulty of doing relative addressing on instruction
2605  * destinations), we could potentially do array access of uniforms
2606  * that were loaded in GRF space as push constants.  In real-world
2607  * usage we've seen, though, the arrays being used are always larger
2608  * than we could load as push constants, so just always move all
2609  * uniform array access out to a pull constant buffer.
2610  */
2611 void
2612 vec4_visitor::move_uniform_array_access_to_pull_constants()
2613 {
2614    int pull_constant_loc[this->uniforms];
2615
2616    for (int i = 0; i < this->uniforms; i++) {
2617       pull_constant_loc[i] = -1;
2618    }
2619
2620    /* Walk through and find array access of uniforms.  Put a copy of that
2621     * uniform in the pull constant buffer.
2622     *
2623     * Note that we don't move constant-indexed accesses to arrays.  No
2624     * testing has been done of the performance impact of this choice.
2625     */
2626    foreach_list_safe(node, &this->instructions) {
2627       vec4_instruction *inst = (vec4_instruction *)node;
2628
2629       for (int i = 0 ; i < 3; i++) {
2630          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2631             continue;
2632
2633          int uniform = inst->src[i].reg;
2634
2635          /* If this array isn't already present in the pull constant buffer,
2636           * add it.
2637           */
2638          if (pull_constant_loc[uniform] == -1) {
2639             const float **values = &prog_data->param[uniform * 4];
2640
2641             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2642
2643             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2644                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2645             }
2646          }
2647
2648          /* Set up the annotation tracking for new generated instructions. */
2649          base_ir = inst->ir;
2650          current_annotation = inst->annotation;
2651
2652          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2653
2654          emit_pull_constant_load(inst, temp, inst->src[i],
2655                                  pull_constant_loc[uniform]);
2656
2657          inst->src[i].file = temp.file;
2658          inst->src[i].reg = temp.reg;
2659          inst->src[i].reg_offset = temp.reg_offset;
2660          inst->src[i].reladdr = NULL;
2661       }
2662    }
2663
2664    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2665     * no need to track them as larger-than-vec4 objects.  This will be
2666     * relied on in cutting out unused uniform vectors from push
2667     * constants.
2668     */
2669    split_uniform_registers();
2670 }
2671
2672 void
2673 vec4_visitor::resolve_ud_negate(src_reg *reg)
2674 {
2675    if (reg->type != BRW_REGISTER_TYPE_UD ||
2676        !reg->negate)
2677       return;
2678
2679    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2680    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2681    *reg = temp;
2682 }
2683
2684 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2685                            struct gl_shader_program *prog,
2686                            struct brw_shader *shader)
2687 {
2688    this->c = c;
2689    this->p = &c->func;
2690    this->brw = p->brw;
2691    this->intel = &brw->intel;
2692    this->ctx = &intel->ctx;
2693    this->prog = prog;
2694    this->shader = shader;
2695
2696    this->mem_ctx = ralloc_context(NULL);
2697    this->failed = false;
2698
2699    this->base_ir = NULL;
2700    this->current_annotation = NULL;
2701
2702    this->c = c;
2703    this->vp = (struct gl_vertex_program *)
2704      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2705    this->prog_data = &c->prog_data;
2706
2707    this->variable_ht = hash_table_ctor(0,
2708                                        hash_table_pointer_hash,
2709                                        hash_table_pointer_compare);
2710
2711    this->virtual_grf_def = NULL;
2712    this->virtual_grf_use = NULL;
2713    this->virtual_grf_sizes = NULL;
2714    this->virtual_grf_count = 0;
2715    this->virtual_grf_reg_map = NULL;
2716    this->virtual_grf_reg_count = 0;
2717    this->virtual_grf_array_size = 0;
2718    this->live_intervals_valid = false;
2719
2720    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2721
2722    this->uniforms = 0;
2723 }
2724
2725 vec4_visitor::~vec4_visitor()
2726 {
2727    ralloc_free(this->mem_ctx);
2728    hash_table_dtor(this->variable_ht);
2729 }
2730
2731
2732 void
2733 vec4_visitor::fail(const char *format, ...)
2734 {
2735    va_list va;
2736    char *msg;
2737
2738    if (failed)
2739       return;
2740
2741    failed = true;
2742
2743    va_start(va, format);
2744    msg = ralloc_vasprintf(mem_ctx, format, va);
2745    va_end(va);
2746    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2747
2748    this->fail_msg = msg;
2749
2750    if (INTEL_DEBUG & DEBUG_VS) {
2751       fprintf(stderr, "%s",  msg);
2752    }
2753 }
2754
2755 } /* namespace brw */