/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_vec4.h"
extern "C" {
#include "main/macros.h"
#include "program/prog_parameter.h"
}

namespace brw {
src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}
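/* An illustrative trace of the conversion above (values are hypothetical):
 * a dst_reg with writemask XZ yields swizzles[] = {0, 2, 2, 2}, i.e. the
 * enabled channels packed first with the last enabled channel replicated
 * into the remaining slots, so reads through the resulting src_reg only
 * ever touch channels that were actually written.
 */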
dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}
vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, dst_reg dst,
                                   src_reg src0, src_reg src1, src_reg src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->ir = v->base_ir;
   this->annotation = v->current_annotation;
}
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(new_inst);

   return inst;
}
vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
                   src_reg src0, src_reg src1, src_reg src2)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
                                             src0, src1, src2));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
}
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(dst_reg dst, src_reg src0)                          \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0);                       \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1);                 \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
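/* For reference, a sketch of what ALU2(ADD) expands to.  The generated
 * helpers build an instruction without emitting it, so callers can write
 * emit(ADD(dst, a, b)) or attach predication/conditional_mod first:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 */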
/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(uint32_t predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}
/** Gen6+ IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);

   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
{
   vec4_instruction *inst;

   /* original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
vec4_instruction *
vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
                                        dst, index);

   return inst;
}
vec4_instruction *
vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
                                        dst, src, index);

   return inst;
}
void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
{
   static enum opcode dot_opcodes[] = {
      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
   };

   emit(dot_opcodes[elements - 2], dst, src0, src1);
}
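/* For illustration: the table is indexed by (elements - 2), so a
 * dot(vec3, vec3) with elements == 3 selects dot_opcodes[1], i.e.
 * BRW_OPCODE_DP3.  Only 2- to 4-component dot products are
 * representable here.
 */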
void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
{
   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * While it would seem that this MOV could be avoided at this point
    * in the case that the swizzle is matched up with the destination
    * writemask, note that uniform packing and register allocation
    * could rearrange our swizzle, so let's leave this matter up to
    * copy propagation later.
    */
   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
   emit(MOV(dst_reg(temp_src), src));

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, temp_src);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, temp_src);
   }
}
void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
{
   vec4_instruction *inst = emit(opcode, dst, src);
   inst->base_mrf = 1;
   inst->mlen = 1;
}
void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return;
   }

   if (intel->gen >= 6) {
      return emit_math1_gen6(opcode, dst, src);
   } else {
      return emit_math1_gen4(opcode, dst, src);
   }
}
void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   src_reg expanded;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.  Move the sources to temporaries to make it
    * generally work.
    */

   expanded = src_reg(this, glsl_type::vec4_type);
   emit(MOV(dst_reg(expanded), src0));
   src0 = expanded;

   expanded = src_reg(this, glsl_type::vec4_type);
   emit(MOV(dst_reg(expanded), src1));
   src1 = expanded;

   if (dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, src0, src1);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src0, src1);
   }
}
void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(opcode, dst, src0, src1);
   inst->base_mrf = 1;
   inst->mlen = 2;
}
void
vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
{
   assert(opcode == SHADER_OPCODE_POW);

   if (intel->gen >= 6) {
      return emit_math2_gen6(opcode, dst, src0, src1);
   } else {
      return emit_math2_gen4(opcode, dst, src0, src1);
   }
}
void
vec4_visitor::visit_instructions(const exec_list *list)
{
   foreach_list(node, list) {
      ir_instruction *ir = (ir_instruction *)node;

      base_ir = ir;
      ir->accept(this);
   }
}
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
   default:
      assert(0);
      return 0;
   }
}
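/* Worked examples of the unit used here (one unit == one allocated vec4):
 * a float or a vec3 costs 1, a mat3 costs its 3 columns, vec2[8] costs
 * 8, and a struct { vec4 a; float b[2]; } costs 1 + 2 = 3.
 */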
int
vec4_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
                                     virtual_grf_array_size);
   }
   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
   virtual_grf_reg_count += size;
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}
dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;
   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[this->uniforms * 4 + i] = &values[i];
      }

      /* Set up pad elements to get things aligned to a vec4 boundary. */
      for (unsigned int i = type->vector_elements; i < 4; i++) {
         static float zero = 0;

         c->prog_data.param[this->uniforms * 4 + i] = &zero;
      }

      /* Track the size of this uniform vector, for future packing of
       * uniforms.
       */
      this->uniform_vector_size[this->uniforms] = type->vector_elements;
      this->uniforms++;

      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
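/* For illustration, assuming a "uniform mat2 m": the matrix branch above
 * recurses once per column, so prog_data.param ends up with two
 * vec4-aligned rows -- m[0].x, m[0].y, pad, pad, then m[1].x, m[1].y,
 * pad, pad -- and uniform_vector_size[] records 2 for each row.
 */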
void
vec4_visitor::setup_uniform_clipplane_values()
{
   int compacted_clipplane_index = 0;
   for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
      if (ctx->Transform.ClipPlanesEnabled & (1 << i)) {
         this->uniform_vector_size[this->uniforms] = 4;
         this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
         this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
         for (int j = 0; j < 4; ++j) {
            c->prog_data.param[this->uniforms * 4 + j] = &ctx->Transform._ClipUserPlane[i][j];
         }
         ++compacted_clipplane_index;
         ++this->uniforms;
      }
   }
}
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);
      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;

      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         last_swiz = swiz;

         c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}
dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
{
   return (dst_reg *)hash_table_find(this->variable_ht, var);
}
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         assert(!"not reached");
         break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
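/* For illustration, assuming a condition like "if (a && b)": the
 * expression path above emits AND(null.d, a, b) with conditional_mod NZ,
 * so the flag register holds the per-channel truth of the AND while
 * *predicate stays BRW_PREDICATE_NORMAL for the IF that consumes it.
 */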
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         return;

      case ir_binop_logic_xor:
         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(OR(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(AND(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_f2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_i2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         return;

      case ir_binop_all_equal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         return;

      case ir_binop_any_nequal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      case ir_unop_any:
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      default:
         assert(!"not reached");
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;
      }
   }

   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   switch (ir->mode) {
   case ir_var_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->location);

      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
       * come in as floating point conversions of the integer values.
       */
      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
         if (!c->key.gl_fixed_input_size[i])
            continue;

         dst_reg dst = *reg;
         dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
         emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
      }
      break;

   case ir_var_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->location + i] = *reg;
         output_reg[ir->location + i].reg_offset = i;
         output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
         output_reg_annotation[ir->location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      this->uniform_size[this->uniforms] = type_size(ir->type);

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }
      break;

   default:
      assert(!"not reached");
   }

   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}
void
vec4_visitor::visit(ir_loop *ir)
{
   dst_reg counter;

   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   if (ir->counter != NULL) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from != NULL) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(MOV(counter, this->result));
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      emit(CMP(dst_null_d(), src_reg(counter), this->result,
               brw_conditional_for_comparison(ir->cmp)));

      vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }

   visit_instructions(&ir->body_instructions);

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(ADD(counter, src_reg(counter), this->result));
   }

   emit(BRW_OPCODE_WHILE);
}
void
vec4_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}
void
vec4_visitor::visit(ir_function_signature *ir)
{
   assert(0);
   (void)ir;
}
void
vec4_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      visit_instructions(&sig->body);
   }
}
bool
vec4_visitor::try_emit_sat(ir_expression *ir)
{
   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   src_reg src = this->result;

   this->result = src_reg(this, ir->type);
   vec4_instruction *inst;
   inst = emit(MOV(dst_reg(this->result), src));
   inst->saturate = true;

   return true;
}
void
vec4_visitor::emit_bool_comparison(unsigned int op,
                                   dst_reg dst, src_reg src0, src_reg src1)
{
   /* original gen4 does destination conversion before comparison. */
   if (intel->gen < 5)
      dst.type = src0.type;

   emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));

   dst.type = BRW_REGISTER_TYPE_D;
   emit(AND(dst, src_reg(dst), src_reg(0x1)));
}
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   if (try_emit_sat(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->print();
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(result_dst, op[0], src_reg(1)));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;

   case ir_unop_sign:
      emit(MOV(result_dst, src_reg(0.0f)));

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
      inst = emit(MOV(result_dst, src_reg(1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;

      emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
      inst = emit(MOV(result_dst, src_reg(-1.0f)));
      inst->predicate = BRW_PREDICATE_NORMAL;

      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdy:
      assert(!"derivatives not valid in vertex shader");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits
          * of one of the operands (src0 on gen6, src1 on gen7).  The
          * MACH accumulates in the contribution of the upper 16 bits
          * of that operand.
          *
          * FINISHME: Emit just the MUL if we know an operand is small
          * enough.
          */
         struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);

         emit(MUL(acc, op[0], op[1]));
         emit(MACH(dst_null_d(), op[0], op[1]));
         emit(MOV(result_dst, src_reg(acc)));
      } else {
         emit(MUL(result_dst, op[0], op[1]));
      }
      break;
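   /* For illustration of the integer sequence above on a gen6-style part,
    * computing c = a * b for 32-bit ints (names are hypothetical):
    *
    *    mul  acc0, a, b   -- partial product from the low 16 bits of a
    *    mach null, a, b   -- folds in a's high 16 bits; acc0 now holds a*b
    *    mov  c, acc0      -- copy the completed low 32 bits out
    */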
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      emit(CMP(result_dst, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      emit(AND(result_dst, result_src, src_reg(0x1)));
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(1)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(AND(result_dst, result_src, src_reg(0x1)));
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(1)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;
   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_f2b:
   case ir_unop_i2b: {
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      emit(AND(result_dst, result_src, src_reg(1)));
      break;
   }

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));

      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   case ir_binop_max:
      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));

      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;
   }
}
void
vec4_visitor::visit(ir_swizzle *ir)
{
   src_reg src;
   int i = 0;
   int swizzle[4];

   /* Note that this is only swizzles in expressions, not those on the left
    * hand side of an assignment, which do write masking.  See ir_assignment
    * for that.
    */

   ir->val->accept(this);
   src = this->result;
   assert(src.file != BAD_FILE);

   for (i = 0; i < ir->type->vector_elements; i++) {
      switch (i) {
      case 0:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
         break;
      case 1:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
         break;
      case 2:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
         break;
      case 3:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
         break;
      }
   }
   for (; i < 4; i++) {
      /* Replicate the last channel out. */
      swizzle[i] = swizzle[ir->type->vector_elements - 1];
   }

   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   this->result = src;
}
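/* An illustrative composition (values are hypothetical): if src arrived
 * with swizzle .yzwx and the IR asks for .xy on a vec2 result, the loop
 * looks up channels x and y through BRW_GET_SWZ, yielding y and z, and
 * the replication loop fills the two unused slots with z, giving a
 * final swizzle of .yzzz.
 */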
void
vec4_visitor::visit(ir_dereference_variable *ir)
{
   const struct glsl_type *type = ir->type;
   dst_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = src_reg(brw_null_reg());
      return;
   }

   this->result = src_reg(*reg);

   if (type->is_scalar() || type->is_vector() || type->is_matrix())
      this->result.swizzle = swizzle_for_size(type->vector_elements);
}
void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int element_size = type_size(ir->type);

   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * element_size;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (element_size == 1) {
         index_reg = this->result;
      } else {
         index_reg = src_reg(this, glsl_type::int_type);

         emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
      }

      if (src.reladdr) {
         src_reg temp = src_reg(this, glsl_type::int_type);

         emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

         index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}
void
vec4_visitor::visit(ir_dereference_record *ir)
{
   unsigned int i;
   const glsl_type *struct_type = ir->record->type;
   int offset = 0;

   ir->record->accept(this);

   for (i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector())
      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      this->result.swizzle = BRW_SWIZZLE_NOOP;
   this->result.type = brw_type_for_base_type(ir->type);

   this->result.reg_offset += offset;
}
/**
 * We want to be careful in assignment setup to hit the actual storage
 * instead of potentially using a temporary like we might with the
 * ir_dereference handler.
 */
static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
{
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
   assert(ir->as_dereference());
   ir_dereference_array *deref_array = ir->as_dereference_array();
   if (deref_array) {
      assert(!deref_array->array->type->is_vector());
   }

   /* Use the rvalue deref handler for the most part.  We'll ignore
    * swizzles in it and write swizzles using writemask, though.
    */
   ir->accept(v);
   return dst_reg(v->result);
}
void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
                              const struct glsl_type *type, uint32_t predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                         type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
         emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   dst->writemask = (1 << type->vector_elements) - 1;

   /* Do we need to worry about swizzling a swizzle? */
   assert(src->swizzle == BRW_SWIZZLE_NOOP);
   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                     dst_reg dst,
                                     src_reg src,
                                     vec4_instruction *pre_rhs_inst,
                                     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         if (!(last_rhs_inst->dst.writemask & (1 << i)))
            return false;

         if (BRW_GET_SWZ(src.swizzle, i) != i)
            return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   uint32_t predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
         emit_bool_to_cond_code(ir->condition, &predicate);
      }

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
          ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
         break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
         swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
         swizzles[i] = first_enabled_chan;
   }

   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);

   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}
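/* A worked example of the remapping above (illustrative values): for
 * "v.yw = u.xy" the write_mask is YW and the vec2 RHS arrives with
 * swizzle .xyyy.  The loop assigns channel y the next RHS channel (x)
 * and channel w the one after (y), while disabled channels get
 * first_enabled_chan, so the final RHS swizzle is .yxyy and the MOV's
 * .yw writemask picks out exactly u.x -> v.y and u.y -> v.w.
 */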
void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_list(node, &ir->components) {
         ir_constant *field_value = (ir_constant *)node;

         emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
         emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      for (int i = 0; i < ir->type->matrix_columns; i++) {
         for (int j = 0; j < ir->type->vector_elements; j++) {
            dst->writemask = 1 << j;
            dst->type = BRW_REGISTER_TYPE_F;

            emit(MOV(*dst,
                     src_reg(ir->value.f[i * ir->type->vector_elements + j])));
         }
         dst->reg_offset++;
      }
      return;
   }

   for (int i = 0; i < ir->type->vector_elements; i++) {
      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(MOV(*dst, src_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_INT:
         emit(MOV(*dst, src_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(MOV(*dst, src_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(MOV(*dst, src_reg(ir->value.b[i])));
         break;
      default:
         assert(!"Non-float/uint/int/bool constant");
         break;
      }
   }
   dst->reg_offset++;
}
void
vec4_visitor::visit(ir_constant *ir)
{
   dst_reg dst = dst_reg(this, ir->type);
   this->result = src_reg(dst);

   emit_constant_values(&dst, ir);
}
void
vec4_visitor::visit(ir_call *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_texture *ir)
{
   /* FINISHME: Implement vertex texturing.
    *
    * With 0 vertex samplers available, the linker will reject
    * programs that do vertex texturing, but after our visitor has
    * run.
    */
   this->result = src_reg(this, glsl_type::vec4_type);
}
void
vec4_visitor::visit(ir_return *ir)
{
   assert(!"not reached");
}

void
vec4_visitor::visit(ir_discard *ir)
{
   assert(!"not reached");
}
void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen == 6) {
      emit_if_gen6(ir);
   } else {
      uint32_t predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}
void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VERT_RESULT_NDC] = ndc;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (intel->gen < 6 &&
       ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
        c->key.nr_userclip || brw->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      GLuint i;

      emit(MOV(header1, 0u));

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);

         current_annotation = "Point size";
         header1.writemask = WRITEMASK_W;
         emit(MUL(header1, psiz, src_reg((float)(1 << 11))));
         emit(AND(header1, src_reg(header1), 0x7ff << 8));
      }

      current_annotation = "Clipping flags";
      for (i = 0; i < c->key.nr_userclip; i++) {
         vec4_instruction *inst;

         inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
                         src_reg(this->userplane[i])));
         inst->conditional_mod = BRW_CONDITIONAL_L;

         inst = emit(OR(header1, src_reg(header1), 1u << i));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
#if 0
         /* FINISHME */
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
#endif
      }

      header1.writemask = WRITEMASK_XYZW;
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (intel->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VERT_RESULT_PSIZ])));
      }
   }
}
void
vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
{
   if (intel->gen < 6) {
      /* Clip distance slots are set aside in gen5, but they are not used.  It
       * is not clear whether we actually need to set aside space for them,
       * but the performance cost is negligible.
       */
      return;
   }

   for (int i = 0; i + offset < c->key.nr_userclip && i < 4; ++i) {
      emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
               src_reg(output_reg[VERT_RESULT_HPOS]),
               src_reg(this->userplane[i + offset])));
   }
}
void
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
{
   assert (vert_result < VERT_RESULT_MAX);
   current_annotation = output_reg_annotation[vert_result];
   /* Copy the register, saturating if necessary */
   vec4_instruction *inst = emit(MOV(reg,
                                     src_reg(output_reg[vert_result])));
   if ((vert_result == VERT_RESULT_COL0 ||
        vert_result == VERT_RESULT_COL1 ||
        vert_result == VERT_RESULT_BFC0 ||
        vert_result == VERT_RESULT_BFC1) &&
       c->key.clamp_vertex_color) {
      inst->saturate = true;
   }
}
void
vec4_visitor::emit_urb_slot(int mrf, int vert_result)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (vert_result) {
   case VERT_RESULT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VERT_RESULT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
      break;
   case BRW_VERT_RESULT_HPOS_DUPLICATE:
   case VERT_RESULT_HPOS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
      break;
   case VERT_RESULT_CLIP_DIST0:
   case VERT_RESULT_CLIP_DIST1:
      if (this->c->key.uses_clip_distance) {
         emit_generic_urb_slot(reg, vert_result);
      } else {
         current_annotation = "user clip distances";
         emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
      }
      break;
   case BRW_VERT_RESULT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, vert_result);
      break;
   }
}
static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   struct intel_context *intel = &brw->intel;

   if (intel->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
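/* For illustration: mlen counts the header MRF plus the payload MRFs, so
 * a write with 4 payload regs arrives here as mlen == 5 and stays 5 (its
 * payload is already a 256-bit multiple), while 3 payload regs arrive as
 * mlen == 4 and get padded to 5 so the post-header data aligns to 256
 * bits.
 */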
/**
 * Generates the VUE payload plus the 1 or 2 URB write instructions to
 * complete the VS thread.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_urb_writes()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* FINISHME: edgeflag */

   brw_compute_vue_map(&c->vue_map, intel, c->key.nr_userclip,
                       c->prog_data.outputs_written);

   /* First mrf is the g0-based message header containing URB handles and such,
    * which is implied in VS_OPCODE_URB_WRITE.
    */
   mrf++;

   if (intel->gen < 6) {
      emit_ndc_computation();
   }

   /* Set up the VUE data for the first URB write */
   int slot;
   for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
      emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);

      /* If this was max_usable_mrf, we can't fit anything more into this URB
       * WRITE.
       */
      if (mrf > max_usable_mrf) {
         slot++;
         break;
      }
   }

   current_annotation = "URB write";
   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
   inst->eot = (slot >= c->vue_map.num_slots);

   /* Optional second URB write */
   if (!inst->eot) {
      mrf = base_mrf + 1;

      for (; slot < c->vue_map.num_slots; ++slot) {
         assert(mrf < max_usable_mrf);

         emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
      }

      current_annotation = "URB write";
      inst = emit(VS_OPCODE_URB_WRITE);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
      inst->eot = true;
      /* URB destination offset.  In the previous write, we got MRFs
       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
       * URB row increments, and each of our MRFs is half of one of
       * those, since we're doing interleaved writes.
       */
      inst->offset = (max_usable_mrf - base_mrf) / 2;
   }

   if (intel->gen == 6)
      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
   else
      c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
}
src_reg
vec4_visitor::get_scratch_offset(vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (intel->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
      emit_before(inst, MUL(dst_reg(index),
                            index, src_reg(message_header_scale)));

      return index;
   } else {
      return src_reg(reg_offset * message_header_scale);
   }
}
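/* For illustration: a constant access at vec4 offset 3 returns an
 * immediate of 6 on gen6+ (two interleaved rows per vec4), or 96 on
 * older parts where the message header wants a byte offset (6 rows of
 * 16 bytes each).
 */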
src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (intel->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else {
      int message_header_scale = intel->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 */
void
vec4_visitor::emit_scratch_read(vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);

   emit_before(inst, SCRATCH_READ(temp, index));
}
/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 */
void
vec4_visitor::emit_scratch_write(vec4_instruction *inst,
                                 src_reg temp, dst_reg orig_dst,
                                 int base_offset)
{
   int reg_offset = base_offset + orig_dst.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);

   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                       orig_dst.writemask));
   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   write->predicate = inst->predicate;
   write->ir = inst->ir;
   write->annotation = inst->annotation;
   inst->insert_after(write);
}
/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->virtual_grf_count];

   for (int i = 0; i < this->virtual_grf_count; i++) {
      scratch_loc[i] = -1;
   }

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && inst->dst.reladdr &&
          scratch_loc[inst->dst.reg] == -1) {
         scratch_loc[inst->dst.reg] = c->last_scratch;
         c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
      }

      for (int i = 0 ; i < 3; i++) {
         src_reg *src = &inst->src[i];

         if (src->file == GRF && src->reladdr &&
             scratch_loc[src->reg] == -1) {
            scratch_loc[src->reg] = c->last_scratch;
            c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
         src_reg temp = src_reg(this, glsl_type::vec4_type);

         emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);

         inst->dst.file = temp.file;
         inst->dst.reg = temp.reg;
         inst->dst.reg_offset = temp.reg_offset;
         inst->dst.reladdr = NULL;
      }

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
            continue;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_scratch_read(inst, temp, inst->src[i],
                           scratch_loc[inst->src[i].reg]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
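/* A note on the sizing above: each virtual GRF of size N reserves
 * N * 8 * 4 bytes of scratch, i.e. two 16-byte rows per vec4, because the
 * data is stored interleaved across a pair of vertices the same way the
 * vertex payload is laid out (matching the scale of 2 in
 * get_scratch_offset()).
 */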
/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                        temp, index);
   load->base_mrf = 14;
   load->mlen = 1;
   emit_before(inst, load);
}
/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}
vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
                           struct gl_shader_program *prog,
                           struct brw_shader *shader)
{
   this->c = c;
   this->p = &c->func;
   this->brw = p->brw;
   this->intel = &brw->intel;
   this->ctx = &intel->ctx;
   this->prog = prog;
   this->shader = shader;

   this->mem_ctx = ralloc_context(NULL);
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;

   this->vp = prog->VertexProgram;
   this->prog_data = &c->prog_data;

   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   this->virtual_grf_def = NULL;
   this->virtual_grf_use = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   this->uniforms = 0;
}
vec4_visitor::~vec4_visitor()
{
   ralloc_free(this->mem_ctx);
   hash_table_dtor(this->variable_ht);
}
void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_VS) {
      fprintf(stderr, "%s", msg);
   }
}
} /* namespace brw */