src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 }
29
30 namespace brw {
31
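/* Convert a dst_reg into an equivalent src_reg.  The destination's
 * writemask becomes a swizzle: the enabled channels are packed into the
 * low components and the last enabled channel is replicated into the
 * rest, so e.g. a writemask of .xz turns into the swizzle .xzzz.
 */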
32 src_reg::src_reg(dst_reg reg)
33 {
34 init();
35
36 this->file = reg.file;
37 this->reg = reg.reg;
38 this->reg_offset = reg.reg_offset;
39 this->type = reg.type;
40 this->reladdr = reg.reladdr;
41 this->fixed_hw_reg = reg.fixed_hw_reg;
42
43 int swizzles[4];
44 int next_chan = 0;
45 int last = 0;
46
47 for (int i = 0; i < 4; i++) {
48 if (!(reg.writemask & (1 << i)))
49 continue;
50
51 swizzles[next_chan++] = last = i;
52 }
53
54 for (; next_chan < 4; next_chan++) {
55 swizzles[next_chan] = last;
56 }
57
58 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59 swizzles[2], swizzles[3]);
60 }
61
62 dst_reg::dst_reg(src_reg reg)
63 {
64 init();
65
66 this->file = reg.file;
67 this->reg = reg.reg;
68 this->reg_offset = reg.reg_offset;
69 this->type = reg.type;
70 this->writemask = WRITEMASK_XYZW;
71 this->reladdr = reg.reladdr;
72 this->fixed_hw_reg = reg.fixed_hw_reg;
73 }
74
75 vec4_instruction::vec4_instruction(vec4_visitor *v,
76 enum opcode opcode, dst_reg dst,
77 src_reg src0, src_reg src1, src_reg src2)
78 {
79 this->opcode = opcode;
80 this->dst = dst;
81 this->src[0] = src0;
82 this->src[1] = src1;
83 this->src[2] = src2;
84 this->ir = v->base_ir;
85 this->annotation = v->current_annotation;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(vec4_instruction *inst)
90 {
91 this->instructions.push_tail(inst);
92
93 return inst;
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
98 {
99 new_inst->ir = inst->ir;
100 new_inst->annotation = inst->annotation;
101
102 inst->insert_before(new_inst);
103
104 return inst;
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
109 src_reg src0, src_reg src1, src_reg src2)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
112 src0, src1, src2));
113 }
114
115
116 vec4_instruction *
117 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
118 {
119 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
120 }
121
122 vec4_instruction *
123 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
124 {
125 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
126 }
127
128 vec4_instruction *
129 vec4_visitor::emit(enum opcode opcode)
130 {
131 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
132 }
133
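/* The ALU1/ALU2 macros below stamp out one small builder method per
 * opcode, so the visitor can write emit(ADD(dst, a, b)) instead of
 * spelling out BRW_OPCODE_ADD.  Note they only allocate the instruction;
 * the caller still has to pass it to emit().
 */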
134 #define ALU1(op) \
135 vec4_instruction * \
136 vec4_visitor::op(dst_reg dst, src_reg src0) \
137 { \
138 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
139 src0); \
140 }
141
142 #define ALU2(op) \
143 vec4_instruction * \
144 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
145 { \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU2(ADD)
157 ALU2(MUL)
158 ALU2(MACH)
159 ALU2(AND)
160 ALU2(OR)
161 ALU2(XOR)
162 ALU2(DP3)
163 ALU2(DP4)
164
165 /** Gen4 predicated IF. */
166 vec4_instruction *
167 vec4_visitor::IF(uint32_t predicate)
168 {
169 vec4_instruction *inst;
170
171 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
172 inst->predicate = predicate;
173
174 return inst;
175 }
176
177 /** Gen6+ IF with embedded comparison. */
178 vec4_instruction *
179 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
180 {
181 assert(intel->gen >= 6);
182
183 vec4_instruction *inst;
184
185 resolve_ud_negate(&src0);
186 resolve_ud_negate(&src1);
187
188 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
189 src0, src1);
190 inst->conditional_mod = condition;
191
192 return inst;
193 }
194
195 /**
196 * CMP: Sets the low bit of the destination channels with the result
197 * of the comparison, while the upper bits are undefined, and updates
198 * the flag register with the packed 16 bits of the result.
199 */
200 vec4_instruction *
201 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
202 {
203 vec4_instruction *inst;
204
205 /* original gen4 does type conversion to the destination type
206 * before comparison, producing garbage results for floating
207 * point comparisons.
208 */
209 if (intel->gen == 4) {
210 dst.type = src0.type;
211 if (dst.file == HW_REG)
212 dst.fixed_hw_reg.type = dst.type;
213 }
214
215 resolve_ud_negate(&src0);
216 resolve_ud_negate(&src1);
217
218 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
219 inst->conditional_mod = condition;
220
221 return inst;
222 }
223
224 vec4_instruction *
225 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
226 {
227 vec4_instruction *inst;
228
229 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
230 dst, index);
231 inst->base_mrf = 14;
232 inst->mlen = 1;
233
234 return inst;
235 }
236
237 vec4_instruction *
238 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
239 {
240 vec4_instruction *inst;
241
242 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
243 dst, src, index);
244 inst->base_mrf = 13;
245 inst->mlen = 2;
246
247 return inst;
248 }
249
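/* Emit a dot product, picking DP2/DP3/DP4 from the number of source
 * components (elements must be in the range 2..4).
 */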
250 void
251 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
252 {
253 static enum opcode dot_opcodes[] = {
254 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
255 };
256
257 emit(dot_opcodes[elements - 2], dst, src0, src1);
258 }
259
260 void
261 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
262 {
263 /* The gen6 math instruction ignores the source modifiers --
264 * swizzle, abs, negate, and at least some parts of the register
265 * region description.
266 *
267 * While it would seem that this MOV could be avoided at this point
268 * in the case that the swizzle is matched up with the destination
269 * writemask, note that uniform packing and register allocation
270 * could rearrange our swizzle, so let's leave this matter up to
271 * copy propagation later.
272 */
273 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
274 emit(MOV(dst_reg(temp_src), src));
275
276 if (dst.writemask != WRITEMASK_XYZW) {
277 /* The gen6 math instruction must be align1, so we can't do
278 * writemasks.
279 */
280 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
281
282 emit(opcode, temp_dst, temp_src);
283
284 emit(MOV(dst, src_reg(temp_dst)));
285 } else {
286 emit(opcode, dst, temp_src);
287 }
288 }
289
290 void
291 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
292 {
293 vec4_instruction *inst = emit(opcode, dst, src);
294 inst->base_mrf = 1;
295 inst->mlen = 1;
296 }
297
298 void
299 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
300 {
301 switch (opcode) {
302 case SHADER_OPCODE_RCP:
303 case SHADER_OPCODE_RSQ:
304 case SHADER_OPCODE_SQRT:
305 case SHADER_OPCODE_EXP2:
306 case SHADER_OPCODE_LOG2:
307 case SHADER_OPCODE_SIN:
308 case SHADER_OPCODE_COS:
309 break;
310 default:
311 assert(!"not reached: bad math opcode");
312 return;
313 }
314
315 if (intel->gen >= 6) {
316 return emit_math1_gen6(opcode, dst, src);
317 } else {
318 return emit_math1_gen4(opcode, dst, src);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen6(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 src_reg expanded;
327
328 /* The gen6 math instruction ignores the source modifiers --
329 * swizzle, abs, negate, and at least some parts of the register
330 * region description. Move the sources to temporaries to make it
331 * generally work.
332 */
333
334 expanded = src_reg(this, glsl_type::vec4_type);
335 expanded.type = src0.type;
336 emit(MOV(dst_reg(expanded), src0));
337 src0 = expanded;
338
339 expanded = src_reg(this, glsl_type::vec4_type);
340 expanded.type = src1.type;
341 emit(MOV(dst_reg(expanded), src1));
342 src1 = expanded;
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (intel->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::visit_instructions(const exec_list *list)
391 {
392 foreach_list(node, list) {
393 ir_instruction *ir = (ir_instruction *)node;
394
395 base_ir = ir;
396 ir->accept(this);
397 }
398 }
399
400
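/* Size of a GLSL type in vec4 registers as laid out by this backend:
 * scalars and vectors each occupy one full vec4 slot, matrices one slot
 * per column, and arrays/structs the sum of their members -- e.g. a mat3
 * counts as 3 and float[8] counts as 8.
 */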
401 static int
402 type_size(const struct glsl_type *type)
403 {
404 unsigned int i;
405 int size;
406
407 switch (type->base_type) {
408 case GLSL_TYPE_UINT:
409 case GLSL_TYPE_INT:
410 case GLSL_TYPE_FLOAT:
411 case GLSL_TYPE_BOOL:
412 if (type->is_matrix()) {
413 return type->matrix_columns;
414 } else {
415 /* Regardless of size of vector, it gets a vec4. This is bad
416 * packing for things like floats, but otherwise arrays become a
417 * mess. Hopefully a later pass over the code can pack scalars
418 * down if appropriate.
419 */
420 return 1;
421 }
422 case GLSL_TYPE_ARRAY:
423 assert(type->length > 0);
424 return type_size(type->fields.array) * type->length;
425 case GLSL_TYPE_STRUCT:
426 size = 0;
427 for (i = 0; i < type->length; i++) {
428 size += type_size(type->fields.structure[i].type);
429 }
430 return size;
431 case GLSL_TYPE_SAMPLER:
432 /* Samplers take up one slot in UNIFORMS[], but they're baked in
433 * at link time.
434 */
435 return 1;
436 default:
437 assert(0);
438 return 0;
439 }
440 }
441
442 int
443 vec4_visitor::virtual_grf_alloc(int size)
444 {
445 if (virtual_grf_array_size <= virtual_grf_count) {
446 if (virtual_grf_array_size == 0)
447 virtual_grf_array_size = 16;
448 else
449 virtual_grf_array_size *= 2;
450 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
451 virtual_grf_array_size);
452 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
453 virtual_grf_array_size);
454 }
455 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
456 virtual_grf_reg_count += size;
457 virtual_grf_sizes[virtual_grf_count] = size;
458 return virtual_grf_count++;
459 }
460
461 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
462 {
463 init();
464
465 this->file = GRF;
466 this->reg = v->virtual_grf_alloc(type_size(type));
467
468 if (type->is_array() || type->is_record()) {
469 this->swizzle = BRW_SWIZZLE_NOOP;
470 } else {
471 this->swizzle = swizzle_for_size(type->vector_elements);
472 }
473
474 this->type = brw_type_for_base_type(type);
475 }
476
477 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
478 {
479 init();
480
481 this->file = GRF;
482 this->reg = v->virtual_grf_alloc(type_size(type));
483
484 if (type->is_array() || type->is_record()) {
485 this->writemask = WRITEMASK_XYZW;
486 } else {
487 this->writemask = (1 << type->vector_elements) - 1;
488 }
489
490 this->type = brw_type_for_base_type(type);
491 }
492
493 /* Our support for uniforms is piggy-backed on the program's parameter
494 * list (this->vp->Base.Parameters here), because that's where the
495 * values actually get stored, rather than in some global
496 * gl_shader_program uniform store.
497 */
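/* Each uniform vector or scalar is padded out to a full vec4: the live
 * channels point at the parameter's values and the pad channels point at
 * a shared zero, so prog_data.param[] always advances in groups of four.
 */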
498 int
499 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
500 {
501 unsigned int offset = 0;
502 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
503
504 if (type->is_matrix()) {
505 const glsl_type *column = type->column_type();
506
507 for (unsigned int i = 0; i < type->matrix_columns; i++) {
508 offset += setup_uniform_values(loc + offset, column);
509 }
510
511 return offset;
512 }
513
514 switch (type->base_type) {
515 case GLSL_TYPE_FLOAT:
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_BOOL:
519 for (unsigned int i = 0; i < type->vector_elements; i++) {
520 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
521 }
522
523 /* Set up pad elements to get things aligned to a vec4 boundary. */
524 for (unsigned int i = type->vector_elements; i < 4; i++) {
525 static float zero = 0;
526
527 c->prog_data.param[this->uniforms * 4 + i] = &zero;
528 }
529
530 /* Track the size of this uniform vector, for future packing of
531 * uniforms.
532 */
533 this->uniform_vector_size[this->uniforms] = type->vector_elements;
534 this->uniforms++;
535
536 return 1;
537
538 case GLSL_TYPE_STRUCT:
539 for (unsigned int i = 0; i < type->length; i++) {
540 offset += setup_uniform_values(loc + offset,
541 type->fields.structure[i].type);
542 }
543 return offset;
544
545 case GLSL_TYPE_ARRAY:
546 for (unsigned int i = 0; i < type->length; i++) {
547 offset += setup_uniform_values(loc + offset, type->fields.array);
548 }
549 return offset;
550
551 case GLSL_TYPE_SAMPLER:
552 /* The sampler takes up a slot, but we don't use any values from it. */
553 return 1;
554
555 default:
556 assert(!"not reached");
557 return 0;
558 }
559 }
560
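/* Upload the enabled user clip planes as uniforms and remember where each
 * one landed in this->userplane[], so the clip-distance and header
 * emission code can DP4 vertex positions against them later.
 */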
561 void
562 vec4_visitor::setup_uniform_clipplane_values()
563 {
564 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
565
566 /* Pre-Gen6, we compact clip planes. For example, if the user
567 * enables just clip planes 0, 1, and 3, we will enable clip planes
568 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
569 * plane 2. This simplifies the implementation of the Gen6 clip
570 * thread.
571 *
572 * In Gen6 and later, we don't compact clip planes, because this
573 * simplifies the implementation of gl_ClipDistance.
574 */
575 int compacted_clipplane_index = 0;
576 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
577 if (intel->gen < 6 &&
578 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
579 continue;
580 }
581 this->uniform_vector_size[this->uniforms] = 4;
582 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
583 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
584 for (int j = 0; j < 4; ++j) {
585 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
586 }
587 ++compacted_clipplane_index;
588 ++this->uniforms;
589 }
590 }
591
592 /* Our support for builtin uniforms is even scarier than non-builtin.
593 * It sits on top of the PROG_STATE_VAR parameters that are
594 * automatically updated from GL context state.
595 */
596 void
597 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
598 {
599 const ir_state_slot *const slots = ir->state_slots;
600 assert(ir->state_slots != NULL);
601
602 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
603 /* This state reference has already been setup by ir_to_mesa,
604 * but we'll get the same index back here. We can reference
605 * ParameterValues directly, since unlike brw_fs.cpp, we never
606 * add new state references during compile.
607 */
608 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
609 (gl_state_index *)slots[i].tokens);
610 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
611
612 this->uniform_vector_size[this->uniforms] = 0;
613 /* Add each of the unique swizzled channels of the element.
614 * This will end up matching the size of the glsl_type of this field.
615 */
616 int last_swiz = -1;
617 for (unsigned int j = 0; j < 4; j++) {
618 int swiz = GET_SWZ(slots[i].swizzle, j);
619 last_swiz = swiz;
620
621 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
622 if (swiz <= last_swiz)
623 this->uniform_vector_size[this->uniforms]++;
624 }
625 this->uniforms++;
626 }
627 }
628
629 dst_reg *
630 vec4_visitor::variable_storage(ir_variable *var)
631 {
632 return (dst_reg *)hash_table_find(this->variable_ht, var);
633 }
634
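/* Evaluate a boolean rvalue into the flag register and report, through
 * *predicate, the predication mode callers should use on the instructions
 * they want to make conditional.  Where possible the comparison is folded
 * into a CMP or logic op with a conditional mod rather than materializing
 * a boolean value first.
 */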
635 void
636 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
637 {
638 ir_expression *expr = ir->as_expression();
639
640 *predicate = BRW_PREDICATE_NORMAL;
641
642 if (expr) {
643 src_reg op[2];
644 vec4_instruction *inst;
645
646 assert(expr->get_num_operands() <= 2);
647 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
648 expr->operands[i]->accept(this);
649 op[i] = this->result;
650
651 resolve_ud_negate(&op[i]);
652 }
653
654 switch (expr->operation) {
655 case ir_unop_logic_not:
656 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
657 inst->conditional_mod = BRW_CONDITIONAL_Z;
658 break;
659
660 case ir_binop_logic_xor:
661 inst = emit(XOR(dst_null_d(), op[0], op[1]));
662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
663 break;
664
665 case ir_binop_logic_or:
666 inst = emit(OR(dst_null_d(), op[0], op[1]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 break;
669
670 case ir_binop_logic_and:
671 inst = emit(AND(dst_null_d(), op[0], op[1]));
672 inst->conditional_mod = BRW_CONDITIONAL_NZ;
673 break;
674
675 case ir_unop_f2b:
676 if (intel->gen >= 6) {
677 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
678 } else {
679 inst = emit(MOV(dst_null_f(), op[0]));
680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
681 }
682 break;
683
684 case ir_unop_i2b:
685 if (intel->gen >= 6) {
686 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
687 } else {
688 inst = emit(MOV(dst_null_d(), op[0]));
689 inst->conditional_mod = BRW_CONDITIONAL_NZ;
690 }
691 break;
692
693 case ir_binop_all_equal:
694 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
695 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
696 break;
697
698 case ir_binop_any_nequal:
699 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
700 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
701 break;
702
703 case ir_unop_any:
704 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
705 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
706 break;
707
708 case ir_binop_greater:
709 case ir_binop_gequal:
710 case ir_binop_less:
711 case ir_binop_lequal:
712 case ir_binop_equal:
713 case ir_binop_nequal:
714 emit(CMP(dst_null_d(), op[0], op[1],
715 brw_conditional_for_comparison(expr->operation)));
716 break;
717
718 default:
719 assert(!"not reached");
720 break;
721 }
722 return;
723 }
724
725 ir->accept(this);
726
727 resolve_ud_negate(&this->result);
728
729 if (intel->gen >= 6) {
730 vec4_instruction *inst = emit(AND(dst_null_d(),
731 this->result, src_reg(1)));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 } else {
734 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
735 inst->conditional_mod = BRW_CONDITIONAL_NZ;
736 }
737 }
738
739 /**
740 * Emit a gen6 IF statement with the comparison folded into the IF
741 * instruction.
742 */
743 void
744 vec4_visitor::emit_if_gen6(ir_if *ir)
745 {
746 ir_expression *expr = ir->condition->as_expression();
747
748 if (expr) {
749 src_reg op[2];
750 dst_reg temp;
751
752 assert(expr->get_num_operands() <= 2);
753 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
754 expr->operands[i]->accept(this);
755 op[i] = this->result;
756 }
757
758 switch (expr->operation) {
759 case ir_unop_logic_not:
760 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
761 return;
762
763 case ir_binop_logic_xor:
764 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
765 return;
766
767 case ir_binop_logic_or:
768 temp = dst_reg(this, glsl_type::bool_type);
769 emit(OR(temp, op[0], op[1]));
770 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
771 return;
772
773 case ir_binop_logic_and:
774 temp = dst_reg(this, glsl_type::bool_type);
775 emit(AND(temp, op[0], op[1]));
776 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778
779 case ir_unop_f2b:
780 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 return;
782
783 case ir_unop_i2b:
784 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
785 return;
786
787 case ir_binop_greater:
788 case ir_binop_gequal:
789 case ir_binop_less:
790 case ir_binop_lequal:
791 case ir_binop_equal:
792 case ir_binop_nequal:
793 emit(IF(op[0], op[1],
794 brw_conditional_for_comparison(expr->operation)));
795 return;
796
797 case ir_binop_all_equal:
798 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
799 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
800 return;
801
802 case ir_binop_any_nequal:
803 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
804 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
805 return;
806
807 case ir_unop_any:
808 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
809 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
810 return;
811
812 default:
813 assert(!"not reached");
814 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
815 return;
816 }
817 return;
818 }
819
820 ir->condition->accept(this);
821
822 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
823 }
824
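/* Allocate (or find) backing storage for a variable and record it in
 * variable_ht: attributes map to ATTR registers, outputs get fresh GRFs
 * that are also noted in output_reg[] for the later URB writes,
 * temporaries get GRFs, and uniforms get UNIFORM-file slots whose values
 * are set up here.
 */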
825 void
826 vec4_visitor::visit(ir_variable *ir)
827 {
828 dst_reg *reg = NULL;
829
830 if (variable_storage(ir))
831 return;
832
833 switch (ir->mode) {
834 case ir_var_in:
835 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
836
837 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
838 * come in as floating point conversions of the integer values.
839 */
840 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
841 if (!c->key.gl_fixed_input_size[i])
842 continue;
843
844 dst_reg dst = *reg;
845 dst.type = brw_type_for_base_type(ir->type);
846 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
847 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
848 }
849 break;
850
851 case ir_var_out:
852 reg = new(mem_ctx) dst_reg(this, ir->type);
853
854 for (int i = 0; i < type_size(ir->type); i++) {
855 output_reg[ir->location + i] = *reg;
856 output_reg[ir->location + i].reg_offset = i;
857 output_reg[ir->location + i].type =
858 brw_type_for_base_type(ir->type->get_scalar_type());
859 output_reg_annotation[ir->location + i] = ir->name;
860 }
861 break;
862
863 case ir_var_auto:
864 case ir_var_temporary:
865 reg = new(mem_ctx) dst_reg(this, ir->type);
866 break;
867
868 case ir_var_uniform:
869 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
870
871 /* Track how big the whole uniform variable is, in case we need to put a
872 * copy of its data into pull constants for array access.
873 */
874 this->uniform_size[this->uniforms] = type_size(ir->type);
875
876 if (!strncmp(ir->name, "gl_", 3)) {
877 setup_builtin_uniform_values(ir);
878 } else {
879 setup_uniform_values(ir->location, ir->type);
880 }
881 break;
882
883 case ir_var_system_value:
884 /* VertexID is stored by the VF as the last vertex element, but
885 * we don't represent it with a flag in inputs_read, so we call
886 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
887 */
888 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
889 prog_data->uses_vertexid = true;
890
891 switch (ir->location) {
892 case SYSTEM_VALUE_VERTEX_ID:
893 reg->writemask = WRITEMASK_X;
894 break;
895 case SYSTEM_VALUE_INSTANCE_ID:
896 reg->writemask = WRITEMASK_Y;
897 break;
898 default:
899 assert(!"not reached");
900 break;
901 }
902 break;
903
904 default:
905 assert(!"not reached");
906 }
907
908 reg->type = brw_type_for_base_type(ir->type);
909 hash_table_insert(this->variable_ht, reg, ir);
910 }
911
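/* Lower a GLSL ir_loop to the hardware DO/WHILE block: the optional
 * counter/from/to/increment fields become an explicit counter register, a
 * predicated BREAK on the exit condition at the top of the body, and an
 * ADD just before the WHILE.
 */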
912 void
913 vec4_visitor::visit(ir_loop *ir)
914 {
915 dst_reg counter;
916
917 /* We don't want debugging output to print the whole body of the
918 * loop as the annotation.
919 */
920 this->base_ir = NULL;
921
922 if (ir->counter != NULL) {
923 this->base_ir = ir->counter;
924 ir->counter->accept(this);
925 counter = *(variable_storage(ir->counter));
926
927 if (ir->from != NULL) {
928 this->base_ir = ir->from;
929 ir->from->accept(this);
930
931 emit(MOV(counter, this->result));
932 }
933 }
934
935 emit(BRW_OPCODE_DO);
936
937 if (ir->to) {
938 this->base_ir = ir->to;
939 ir->to->accept(this);
940
941 emit(CMP(dst_null_d(), src_reg(counter), this->result,
942 brw_conditional_for_comparison(ir->cmp)));
943
944 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
945 inst->predicate = BRW_PREDICATE_NORMAL;
946 }
947
948 visit_instructions(&ir->body_instructions);
949
950
951 if (ir->increment) {
952 this->base_ir = ir->increment;
953 ir->increment->accept(this);
954 emit(ADD(counter, src_reg(counter), this->result));
955 }
956
957 emit(BRW_OPCODE_WHILE);
958 }
959
960 void
961 vec4_visitor::visit(ir_loop_jump *ir)
962 {
963 switch (ir->mode) {
964 case ir_loop_jump::jump_break:
965 emit(BRW_OPCODE_BREAK);
966 break;
967 case ir_loop_jump::jump_continue:
968 emit(BRW_OPCODE_CONTINUE);
969 break;
970 }
971 }
972
973
974 void
975 vec4_visitor::visit(ir_function_signature *ir)
976 {
977 assert(0);
978 (void)ir;
979 }
980
981 void
982 vec4_visitor::visit(ir_function *ir)
983 {
984 /* Ignore function bodies other than main() -- we shouldn't see calls to
985 * them since they should all be inlined.
986 */
987 if (strcmp(ir->name, "main") == 0) {
988 const ir_function_signature *sig;
989 exec_list empty;
990
991 sig = ir->matching_signature(&empty);
992
993 assert(sig);
994
995 visit_instructions(&sig->body);
996 }
997 }
998
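/* If this expression is really a saturate of some other rvalue (as
 * reported by as_rvalue_to_saturate()), emit it as a single saturating
 * MOV and skip the normal expression path.  Returns true if it did so.
 */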
999 bool
1000 vec4_visitor::try_emit_sat(ir_expression *ir)
1001 {
1002 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1003 if (!sat_src)
1004 return false;
1005
1006 sat_src->accept(this);
1007 src_reg src = this->result;
1008
1009 this->result = src_reg(this, ir->type);
1010 vec4_instruction *inst;
1011 inst = emit(MOV(dst_reg(this->result), src));
1012 inst->saturate = true;
1013
1014 return true;
1015 }
1016
1017 void
1018 vec4_visitor::emit_bool_comparison(unsigned int op,
1019 dst_reg dst, src_reg src0, src_reg src1)
1020 {
1021 /* original gen4 does destination conversion before comparison. */
1022 if (intel->gen < 5)
1023 dst.type = src0.type;
1024
1025 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1026
1027 dst.type = BRW_REGISTER_TYPE_D;
1028 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1029 }
1030
1031 void
1032 vec4_visitor::visit(ir_expression *ir)
1033 {
1034 unsigned int operand;
1035 src_reg op[Elements(ir->operands)];
1036 src_reg result_src;
1037 dst_reg result_dst;
1038 vec4_instruction *inst;
1039
1040 if (try_emit_sat(ir))
1041 return;
1042
1043 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1044 this->result.file = BAD_FILE;
1045 ir->operands[operand]->accept(this);
1046 if (this->result.file == BAD_FILE) {
1047 printf("Failed to get tree for expression operand:\n");
1048 ir->operands[operand]->print();
1049 exit(1);
1050 }
1051 op[operand] = this->result;
1052
1053 /* Matrix expression operands should have been broken down to vector
1054 * operations already.
1055 */
1056 assert(!ir->operands[operand]->type->is_matrix());
1057 }
1058
1059 int vector_elements = ir->operands[0]->type->vector_elements;
1060 if (ir->operands[1]) {
1061 vector_elements = MAX2(vector_elements,
1062 ir->operands[1]->type->vector_elements);
1063 }
1064
1065 this->result.file = BAD_FILE;
1066
1067 /* Storage for our result. Ideally for an assignment we'd be using
1068 * the actual storage for the result here, instead.
1069 */
1070 result_src = src_reg(this, ir->type);
1071 /* convenience for the emit functions below. */
1072 result_dst = dst_reg(result_src);
1073 /* If nothing special happens, this is the result. */
1074 this->result = result_src;
1075 /* Limit writes to the channels that will be used by result_src later.
1076 * This does limit this temp's use as a temporary for multi-instruction
1077 * sequences.
1078 */
1079 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1080
1081 switch (ir->operation) {
1082 case ir_unop_logic_not:
1083 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1084 * ones' complement of the whole register, not just bit 0.
1085 */
1086 emit(XOR(result_dst, op[0], src_reg(1)));
1087 break;
1088 case ir_unop_neg:
1089 op[0].negate = !op[0].negate;
1090 this->result = op[0];
1091 break;
1092 case ir_unop_abs:
1093 op[0].abs = true;
1094 op[0].negate = false;
1095 this->result = op[0];
1096 break;
1097
1098 case ir_unop_sign:
1099 emit(MOV(result_dst, src_reg(0.0f)));
1100
1101 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1102 inst = emit(MOV(result_dst, src_reg(1.0f)));
1103 inst->predicate = BRW_PREDICATE_NORMAL;
1104
1105 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1106 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108
1109 break;
1110
1111 case ir_unop_rcp:
1112 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1113 break;
1114
1115 case ir_unop_exp2:
1116 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1117 break;
1118 case ir_unop_log2:
1119 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1120 break;
1121 case ir_unop_exp:
1122 case ir_unop_log:
1123 assert(!"not reached: should be handled by ir_explog_to_explog2");
1124 break;
1125 case ir_unop_sin:
1126 case ir_unop_sin_reduced:
1127 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1128 break;
1129 case ir_unop_cos:
1130 case ir_unop_cos_reduced:
1131 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1132 break;
1133
1134 case ir_unop_dFdx:
1135 case ir_unop_dFdy:
1136 assert(!"derivatives not valid in vertex shader");
1137 break;
1138
1139 case ir_unop_noise:
1140 assert(!"not reached: should be handled by lower_noise");
1141 break;
1142
1143 case ir_binop_add:
1144 emit(ADD(result_dst, op[0], op[1]));
1145 break;
1146 case ir_binop_sub:
1147 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1148 break;
1149
1150 case ir_binop_mul:
1151 if (ir->type->is_integer()) {
1152 /* For integer multiplication, the MUL uses the low 16 bits
1153 * of one of the operands (src0 on gen6, src1 on gen7). The
1154 * MACH then adds in the contribution of the upper 16 bits
1155 * of that operand.
1156 *
1157 * FINISHME: Emit just the MUL if we know an operand is small
1158 * enough.
1159 */
1160 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1161
1162 emit(MUL(acc, op[0], op[1]));
1163 emit(MACH(dst_null_d(), op[0], op[1]));
1164 emit(MOV(result_dst, src_reg(acc)));
1165 } else {
1166 emit(MUL(result_dst, op[0], op[1]));
1167 }
1168 break;
1169 case ir_binop_div:
1170 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1171 assert(ir->type->is_integer());
1172 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1173 break;
1174 case ir_binop_mod:
1175 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1176 assert(ir->type->is_integer());
1177 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1178 break;
1179
1180 case ir_binop_less:
1181 case ir_binop_greater:
1182 case ir_binop_lequal:
1183 case ir_binop_gequal:
1184 case ir_binop_equal:
1185 case ir_binop_nequal: {
1186 emit(CMP(result_dst, op[0], op[1],
1187 brw_conditional_for_comparison(ir->operation)));
1188 emit(AND(result_dst, result_src, src_reg(0x1)));
1189 break;
1190 }
1191
1192 case ir_binop_all_equal:
1193 /* "==" operator producing a scalar boolean. */
1194 if (ir->operands[0]->type->is_vector() ||
1195 ir->operands[1]->type->is_vector()) {
1196 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1197 emit(MOV(result_dst, src_reg(0)));
1198 inst = emit(MOV(result_dst, src_reg(1)));
1199 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1200 } else {
1201 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1202 emit(AND(result_dst, result_src, src_reg(0x1)));
1203 }
1204 break;
1205 case ir_binop_any_nequal:
1206 /* "!=" operator producing a scalar boolean. */
1207 if (ir->operands[0]->type->is_vector() ||
1208 ir->operands[1]->type->is_vector()) {
1209 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1210
1211 emit(MOV(result_dst, src_reg(0)));
1212 inst = emit(MOV(result_dst, src_reg(1)));
1213 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1214 } else {
1215 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1216 emit(AND(result_dst, result_src, src_reg(0x1)));
1217 }
1218 break;
1219
1220 case ir_unop_any:
1221 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1222 emit(MOV(result_dst, src_reg(0)));
1223
1224 inst = emit(MOV(result_dst, src_reg(1)));
1225 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1226 break;
1227
1228 case ir_binop_logic_xor:
1229 emit(XOR(result_dst, op[0], op[1]));
1230 break;
1231
1232 case ir_binop_logic_or:
1233 emit(OR(result_dst, op[0], op[1]));
1234 break;
1235
1236 case ir_binop_logic_and:
1237 emit(AND(result_dst, op[0], op[1]));
1238 break;
1239
1240 case ir_binop_dot:
1241 assert(ir->operands[0]->type->is_vector());
1242 assert(ir->operands[0]->type == ir->operands[1]->type);
1243 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1244 break;
1245
1246 case ir_unop_sqrt:
1247 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1248 break;
1249 case ir_unop_rsq:
1250 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1251 break;
1252 case ir_unop_i2f:
1253 case ir_unop_i2u:
1254 case ir_unop_u2i:
1255 case ir_unop_u2f:
1256 case ir_unop_b2f:
1257 case ir_unop_b2i:
1258 case ir_unop_f2i:
1259 emit(MOV(result_dst, op[0]));
1260 break;
1261 case ir_unop_f2b:
1262 case ir_unop_i2b: {
1263 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1264 emit(AND(result_dst, result_src, src_reg(1)));
1265 break;
1266 }
1267
1268 case ir_unop_trunc:
1269 emit(RNDZ(result_dst, op[0]));
1270 break;
1271 case ir_unop_ceil:
1272 op[0].negate = !op[0].negate;
1273 inst = emit(RNDD(result_dst, op[0]));
1274 this->result.negate = true;
1275 break;
1276 case ir_unop_floor:
1277 inst = emit(RNDD(result_dst, op[0]));
1278 break;
1279 case ir_unop_fract:
1280 inst = emit(FRC(result_dst, op[0]));
1281 break;
1282 case ir_unop_round_even:
1283 emit(RNDE(result_dst, op[0]));
1284 break;
1285
1286 case ir_binop_min:
1287 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1288
1289 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 break;
1292 case ir_binop_max:
1293 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1294
1295 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1296 inst->predicate = BRW_PREDICATE_NORMAL;
1297 break;
1298
1299 case ir_binop_pow:
1300 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1301 break;
1302
1303 case ir_unop_bit_not:
1304 inst = emit(NOT(result_dst, op[0]));
1305 break;
1306 case ir_binop_bit_and:
1307 inst = emit(AND(result_dst, op[0], op[1]));
1308 break;
1309 case ir_binop_bit_xor:
1310 inst = emit(XOR(result_dst, op[0], op[1]));
1311 break;
1312 case ir_binop_bit_or:
1313 inst = emit(OR(result_dst, op[0], op[1]));
1314 break;
1315
1316 case ir_binop_lshift:
1317 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1318 break;
1319
1320 case ir_binop_rshift:
1321 if (ir->type->base_type == GLSL_TYPE_INT)
1322 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1323 else
1324 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1325 break;
1326
1327 case ir_quadop_vector:
1328 assert(!"not reached: should be handled by lower_quadop_vector");
1329 break;
1330 }
1331 }
1332
1333
1334 void
1335 vec4_visitor::visit(ir_swizzle *ir)
1336 {
1337 src_reg src;
1338 int i = 0;
1339 int swizzle[4];
1340
1341 /* Note that this is only swizzles in expressions, not those on the left
1342 * hand side of an assignment, which do write masking. See ir_assignment
1343 * for that.
1344 */
1345
1346 ir->val->accept(this);
1347 src = this->result;
1348 assert(src.file != BAD_FILE);
1349
1350 for (i = 0; i < ir->type->vector_elements; i++) {
1351 switch (i) {
1352 case 0:
1353 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1354 break;
1355 case 1:
1356 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1357 break;
1358 case 2:
1359 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1360 break;
1361 case 3:
1362 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1363 break;
1364 }
1365 }
1366 for (; i < 4; i++) {
1367 /* Replicate the last channel out. */
1368 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1369 }
1370
1371 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1372
1373 this->result = src;
1374 }
1375
1376 void
1377 vec4_visitor::visit(ir_dereference_variable *ir)
1378 {
1379 const struct glsl_type *type = ir->type;
1380 dst_reg *reg = variable_storage(ir->var);
1381
1382 if (!reg) {
1383 fail("Failed to find variable storage for %s\n", ir->var->name);
1384 this->result = src_reg(brw_null_reg());
1385 return;
1386 }
1387
1388 this->result = src_reg(*reg);
1389
1390 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1391 this->result.swizzle = swizzle_for_size(type->vector_elements);
1392 }
1393
1394 void
1395 vec4_visitor::visit(ir_dereference_array *ir)
1396 {
1397 ir_constant *constant_index;
1398 src_reg src;
1399 int element_size = type_size(ir->type);
1400
1401 constant_index = ir->array_index->constant_expression_value();
1402
1403 ir->array->accept(this);
1404 src = this->result;
1405
1406 if (constant_index) {
1407 src.reg_offset += constant_index->value.i[0] * element_size;
1408 } else {
1409 /* Variable index array dereference. It eats the "vec4" of the
1410 * base of the array and an index that offsets the Mesa register
1411 * index.
1412 */
1413 ir->array_index->accept(this);
1414
1415 src_reg index_reg;
1416
1417 if (element_size == 1) {
1418 index_reg = this->result;
1419 } else {
1420 index_reg = src_reg(this, glsl_type::int_type);
1421
1422 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1423 }
1424
1425 if (src.reladdr) {
1426 src_reg temp = src_reg(this, glsl_type::int_type);
1427
1428 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1429
1430 index_reg = temp;
1431 }
1432
1433 src.reladdr = ralloc(mem_ctx, src_reg);
1434 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1435 }
1436
1437 /* If the type is smaller than a vec4, replicate the last channel out. */
1438 if (ir->type->is_scalar() || ir->type->is_vector())
1439 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1440 else
1441 src.swizzle = BRW_SWIZZLE_NOOP;
1442 src.type = brw_type_for_base_type(ir->type);
1443
1444 this->result = src;
1445 }
1446
1447 void
1448 vec4_visitor::visit(ir_dereference_record *ir)
1449 {
1450 unsigned int i;
1451 const glsl_type *struct_type = ir->record->type;
1452 int offset = 0;
1453
1454 ir->record->accept(this);
1455
1456 for (i = 0; i < struct_type->length; i++) {
1457 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1458 break;
1459 offset += type_size(struct_type->fields.structure[i].type);
1460 }
1461
1462 /* If the type is smaller than a vec4, replicate the last channel out. */
1463 if (ir->type->is_scalar() || ir->type->is_vector())
1464 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1465 else
1466 this->result.swizzle = BRW_SWIZZLE_NOOP;
1467 this->result.type = brw_type_for_base_type(ir->type);
1468
1469 this->result.reg_offset += offset;
1470 }
1471
1472 /**
1473 * We want to be careful in assignment setup to hit the actual storage
1474 * instead of potentially using a temporary like we might with the
1475 * ir_dereference handler.
1476 */
1477 static dst_reg
1478 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1479 {
1480 /* The LHS must be a dereference. If the LHS is a variable indexed array
1481 * access of a vector, it must be separated into a series of conditional moves
1482 * before reaching this point (see ir_vec_index_to_cond_assign).
1483 */
1484 assert(ir->as_dereference());
1485 ir_dereference_array *deref_array = ir->as_dereference_array();
1486 if (deref_array) {
1487 assert(!deref_array->array->type->is_vector());
1488 }
1489
1490 /* Use the rvalue deref handler for the most part. We'll ignore
1491 * swizzles in it and write swizzles using writemask, though.
1492 */
1493 ir->accept(v);
1494 return dst_reg(v->result);
1495 }
1496
1497 void
1498 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1499 const struct glsl_type *type, uint32_t predicate)
1500 {
1501 if (type->base_type == GLSL_TYPE_STRUCT) {
1502 for (unsigned int i = 0; i < type->length; i++) {
1503 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1504 }
1505 return;
1506 }
1507
1508 if (type->is_array()) {
1509 for (unsigned int i = 0; i < type->length; i++) {
1510 emit_block_move(dst, src, type->fields.array, predicate);
1511 }
1512 return;
1513 }
1514
1515 if (type->is_matrix()) {
1516 const struct glsl_type *vec_type;
1517
1518 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1519 type->vector_elements, 1);
1520
1521 for (int i = 0; i < type->matrix_columns; i++) {
1522 emit_block_move(dst, src, vec_type, predicate);
1523 }
1524 return;
1525 }
1526
1527 assert(type->is_scalar() || type->is_vector());
1528
1529 dst->type = brw_type_for_base_type(type);
1530 src->type = dst->type;
1531
1532 dst->writemask = (1 << type->vector_elements) - 1;
1533
1534 /* Do we need to worry about swizzling a swizzle? */
1535 assert(src->swizzle == BRW_SWIZZLE_NOOP
1536 || src->swizzle == swizzle_for_size(type->vector_elements));
1537 src->swizzle = swizzle_for_size(type->vector_elements);
1538
1539 vec4_instruction *inst = emit(MOV(*dst, *src));
1540 inst->predicate = predicate;
1541
1542 dst->reg_offset++;
1543 src->reg_offset++;
1544 }
1545
1546
1547 /* If the RHS processing resulted in an instruction generating a
1548 * temporary value, and it would be easy to rewrite the instruction to
1549 * generate its result right into the LHS instead, do so. This ends
1550 * up reliably removing instructions where it can be tricky to do so
1551 * later without real UD chain information.
1552 */
1553 bool
1554 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1555 dst_reg dst,
1556 src_reg src,
1557 vec4_instruction *pre_rhs_inst,
1558 vec4_instruction *last_rhs_inst)
1559 {
1560 /* This could be supported, but it would take more smarts. */
1561 if (ir->condition)
1562 return false;
1563
1564 if (pre_rhs_inst == last_rhs_inst)
1565 return false; /* No instructions generated to work with. */
1566
1567 /* Make sure the last instruction generated our source reg. */
1568 if (src.file != GRF ||
1569 src.file != last_rhs_inst->dst.file ||
1570 src.reg != last_rhs_inst->dst.reg ||
1571 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1572 src.reladdr ||
1573 src.abs ||
1574 src.negate ||
1575 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1576 return false;
1577
1578 /* Check that that last instruction fully initialized the channels
1579 * we want to use, in the order we want to use them. We could
1580 * potentially reswizzle the operands of many instructions so that
1581 * we could handle out of order channels, but don't yet.
1582 */
1583
1584 for (unsigned i = 0; i < 4; i++) {
1585 if (dst.writemask & (1 << i)) {
1586 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1587 return false;
1588
1589 if (BRW_GET_SWZ(src.swizzle, i) != i)
1590 return false;
1591 }
1592 }
1593
1594 /* Success! Rewrite the instruction. */
1595 last_rhs_inst->dst.file = dst.file;
1596 last_rhs_inst->dst.reg = dst.reg;
1597 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1598 last_rhs_inst->dst.reladdr = dst.reladdr;
1599 last_rhs_inst->dst.writemask &= dst.writemask;
1600
1601 return true;
1602 }
1603
1604 void
1605 vec4_visitor::visit(ir_assignment *ir)
1606 {
1607 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1608 uint32_t predicate = BRW_PREDICATE_NONE;
1609
1610 if (!ir->lhs->type->is_scalar() &&
1611 !ir->lhs->type->is_vector()) {
1612 ir->rhs->accept(this);
1613 src_reg src = this->result;
1614
1615 if (ir->condition) {
1616 emit_bool_to_cond_code(ir->condition, &predicate);
1617 }
1618
1619 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1620 return;
1621 }
1622
1623 /* Now we're down to just a scalar/vector with writemasks. */
1624 int i;
1625
1626 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1627 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1628
1629 ir->rhs->accept(this);
1630
1631 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1632
1633 src_reg src = this->result;
1634
1635 int swizzles[4];
1636 int first_enabled_chan = 0;
1637 int src_chan = 0;
1638
1639 assert(ir->lhs->type->is_vector() ||
1640 ir->lhs->type->is_scalar());
1641 dst.writemask = ir->write_mask;
1642
1643 for (int i = 0; i < 4; i++) {
1644 if (dst.writemask & (1 << i)) {
1645 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1646 break;
1647 }
1648 }
1649
1650 /* Swizzle a small RHS vector into the channels being written.
1651 *
1652 * glsl ir treats write_mask as dictating how many channels are
1653 * present on the RHS while in our instructions we need to make
1654 * those channels appear in the slots of the vec4 they're written to.
1655 */
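/* For example, a vec2 RHS (swizzle .xyyy) assigned to dst.zw comes out as
 * the swizzle .yyxy: the written channels z and w read the RHS's x and y,
 * and the unwritten channels just replicate a don't-care component.
 */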
1656 for (int i = 0; i < 4; i++) {
1657 if (dst.writemask & (1 << i))
1658 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1659 else
1660 swizzles[i] = first_enabled_chan;
1661 }
1662 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1663 swizzles[2], swizzles[3]);
1664
1665 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1666 return;
1667 }
1668
1669 if (ir->condition) {
1670 emit_bool_to_cond_code(ir->condition, &predicate);
1671 }
1672
1673 for (i = 0; i < type_size(ir->lhs->type); i++) {
1674 vec4_instruction *inst = emit(MOV(dst, src));
1675 inst->predicate = predicate;
1676
1677 dst.reg_offset++;
1678 src.reg_offset++;
1679 }
1680 }
1681
1682 void
1683 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1684 {
1685 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1686 foreach_list(node, &ir->components) {
1687 ir_constant *field_value = (ir_constant *)node;
1688
1689 emit_constant_values(dst, field_value);
1690 }
1691 return;
1692 }
1693
1694 if (ir->type->is_array()) {
1695 for (unsigned int i = 0; i < ir->type->length; i++) {
1696 emit_constant_values(dst, ir->array_elements[i]);
1697 }
1698 return;
1699 }
1700
1701 if (ir->type->is_matrix()) {
1702 for (int i = 0; i < ir->type->matrix_columns; i++) {
1703 for (int j = 0; j < ir->type->vector_elements; j++) {
1704 dst->writemask = 1 << j;
1705 dst->type = BRW_REGISTER_TYPE_F;
1706
1707 emit(MOV(*dst,
1708 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1709 }
1710 dst->reg_offset++;
1711 }
1712 return;
1713 }
1714
1715 for (int i = 0; i < ir->type->vector_elements; i++) {
1716 dst->writemask = 1 << i;
1717 dst->type = brw_type_for_base_type(ir->type);
1718
1719 switch (ir->type->base_type) {
1720 case GLSL_TYPE_FLOAT:
1721 emit(MOV(*dst, src_reg(ir->value.f[i])));
1722 break;
1723 case GLSL_TYPE_INT:
1724 emit(MOV(*dst, src_reg(ir->value.i[i])));
1725 break;
1726 case GLSL_TYPE_UINT:
1727 emit(MOV(*dst, src_reg(ir->value.u[i])));
1728 break;
1729 case GLSL_TYPE_BOOL:
1730 emit(MOV(*dst, src_reg(ir->value.b[i])));
1731 break;
1732 default:
1733 assert(!"Non-float/uint/int/bool constant");
1734 break;
1735 }
1736 }
1737 dst->reg_offset++;
1738 }
1739
1740 void
1741 vec4_visitor::visit(ir_constant *ir)
1742 {
1743 dst_reg dst = dst_reg(this, ir->type);
1744 this->result = src_reg(dst);
1745
1746 emit_constant_values(&dst, ir);
1747 }
1748
1749 void
1750 vec4_visitor::visit(ir_call *ir)
1751 {
1752 assert(!"not reached");
1753 }
1754
1755 void
1756 vec4_visitor::visit(ir_texture *ir)
1757 {
1758 /* FINISHME: Implement vertex texturing.
1759 *
1760 * With 0 vertex samplers available, the linker will reject
1761 * programs that do vertex texturing, but after our visitor has
1762 * run.
1763 */
1764 this->result = src_reg(this, glsl_type::vec4_type);
1765 }
1766
1767 void
1768 vec4_visitor::visit(ir_return *ir)
1769 {
1770 assert(!"not reached");
1771 }
1772
1773 void
1774 vec4_visitor::visit(ir_discard *ir)
1775 {
1776 assert(!"not reached");
1777 }
1778
1779 void
1780 vec4_visitor::visit(ir_if *ir)
1781 {
1782 /* Don't point the annotation at the if statement, because then it plus
1783 * the then and else blocks get printed.
1784 */
1785 this->base_ir = ir->condition;
1786
1787 if (intel->gen == 6) {
1788 emit_if_gen6(ir);
1789 } else {
1790 uint32_t predicate;
1791 emit_bool_to_cond_code(ir->condition, &predicate);
1792 emit(IF(predicate));
1793 }
1794
1795 visit_instructions(&ir->then_instructions);
1796
1797 if (!ir->else_instructions.is_empty()) {
1798 this->base_ir = ir->condition;
1799 emit(BRW_OPCODE_ELSE);
1800
1801 visit_instructions(&ir->else_instructions);
1802 }
1803
1804 this->base_ir = ir->condition;
1805 emit(BRW_OPCODE_ENDIF);
1806 }
1807
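/* Pre-gen6 only: compute the normalized device coordinates the fixed
 * function expects in the VUE header, i.e. (x/w, y/w, z/w, 1/w) of
 * gl_Position.
 */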
1808 void
1809 vec4_visitor::emit_ndc_computation()
1810 {
1811 /* Get the position */
1812 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1813
1814 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1815 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1816 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1817
1818 current_annotation = "NDC";
1819 dst_reg ndc_w = ndc;
1820 ndc_w.writemask = WRITEMASK_W;
1821 src_reg pos_w = pos;
1822 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1823 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1824
1825 dst_reg ndc_xyz = ndc;
1826 ndc_xyz.writemask = WRITEMASK_XYZ;
1827
1828 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1829 }
1830
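/* Fill the VUE header slot that carries the point size and, on pre-gen6
 * hardware, the per-vertex clip flags: each user clip plane gets its bit
 * set when the DP4 of the position against that plane is negative.
 */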
1831 void
1832 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1833 {
1834 if (intel->gen < 6 &&
1835 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1836 c->key.userclip_active || brw->has_negative_rhw_bug)) {
1837 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1838 dst_reg header1_w = header1;
1839 header1_w.writemask = WRITEMASK_W;
1840 GLuint i;
1841
1842 emit(MOV(header1, 0u));
1843
1844 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1845 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
1846
1847 current_annotation = "Point size";
1848 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1849 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1850 }
1851
1852 current_annotation = "Clipping flags";
1853 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1854 vec4_instruction *inst;
1855
1856 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
1857 src_reg(this->userplane[i])));
1858 inst->conditional_mod = BRW_CONDITIONAL_L;
1859
1860 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
1861 inst->predicate = BRW_PREDICATE_NORMAL;
1862 }
1863
1864 /* i965 clipping workaround:
1865 * 1) Test for -ve rhw
1866 * 2) If set,
1867 * set ndc = (0,0,0,0)
1868 * set ucp[6] = 1
1869 *
1870 * Later, clipping will detect ucp[6] and ensure the primitive is
1871 * clipped against all fixed planes.
1872 */
1873 if (brw->has_negative_rhw_bug) {
1874 #if 0
1875 /* FINISHME */
1876 brw_CMP(p,
1877 vec8(brw_null_reg()),
1878 BRW_CONDITIONAL_L,
1879 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
1880 brw_imm_f(0));
1881
1882 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1883 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
1884 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1885 #endif
1886 }
1887
1888 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1889 } else if (intel->gen < 6) {
1890 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1891 } else {
1892 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1893 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1894 emit(MOV(brw_writemask(reg, WRITEMASK_W),
1895 src_reg(output_reg[VERT_RESULT_PSIZ])));
1896 }
1897 }
1898 }
1899
1900 void
1901 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
1902 {
1903 if (intel->gen < 6) {
1904 /* Clip distance slots are set aside in gen5, but they are not used. It
1905 * is not clear whether we actually need to set aside space for them,
1906 * but the performance cost is negligible.
1907 */
1908 return;
1909 }
1910
1911 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1912 *
1913 * "If a linked set of shaders forming the vertex stage contains no
1914 * static write to gl_ClipVertex or gl_ClipDistance, but the
1915 * application has requested clipping against user clip planes through
1916 * the API, then the coordinate written to gl_Position is used for
1917 * comparison against the user clip planes."
1918 *
1919 * This function is only called if the shader didn't write to
1920 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
1921 * if the user wrote to it; otherwise we use gl_Position.
1922 */
1923 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
1924 if (!(c->prog_data.outputs_written
1925 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
1926 clip_vertex = VERT_RESULT_HPOS;
1927 }
1928
1929 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
1930 ++i) {
1931 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
1932 src_reg(output_reg[clip_vertex]),
1933 src_reg(this->userplane[i + offset])));
1934 }
1935 }
1936
1937 void
1938 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
1939 {
1940 assert (vert_result < VERT_RESULT_MAX);
1941 reg.type = output_reg[vert_result].type;
1942 current_annotation = output_reg_annotation[vert_result];
1943 /* Copy the register, saturating if necessary */
1944 vec4_instruction *inst = emit(MOV(reg,
1945 src_reg(output_reg[vert_result])));
1946 if ((vert_result == VERT_RESULT_COL0 ||
1947 vert_result == VERT_RESULT_COL1 ||
1948 vert_result == VERT_RESULT_BFC0 ||
1949 vert_result == VERT_RESULT_BFC1) &&
1950 c->key.clamp_vertex_color) {
1951 inst->saturate = true;
1952 }
1953 }
1954
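/* Write one VUE slot's worth of data into the given MRF, dispatching on
 * which vert_result the slot holds (position, NDC, point size and flags,
 * clip distances, or a generic varying).
 */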
1955 void
1956 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
1957 {
1958 struct brw_reg hw_reg = brw_message_reg(mrf);
1959 dst_reg reg = dst_reg(MRF, mrf);
1960 reg.type = BRW_REGISTER_TYPE_F;
1961
1962 switch (vert_result) {
1963 case VERT_RESULT_PSIZ:
1964 /* PSIZ is always in slot 0, and is coupled with other flags. */
1965 current_annotation = "indices, point width, clip flags";
1966 emit_psiz_and_flags(hw_reg);
1967 break;
1968 case BRW_VERT_RESULT_NDC:
1969 current_annotation = "NDC";
1970 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
1971 break;
1972 case BRW_VERT_RESULT_HPOS_DUPLICATE:
1973 case VERT_RESULT_HPOS:
1974 current_annotation = "gl_Position";
1975 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
1976 break;
1977 case VERT_RESULT_CLIP_DIST0:
1978 case VERT_RESULT_CLIP_DIST1:
1979 if (this->c->key.uses_clip_distance) {
1980 emit_generic_urb_slot(reg, vert_result);
1981 } else {
1982 current_annotation = "user clip distances";
1983 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
1984 }
1985 break;
1986 case BRW_VERT_RESULT_PAD:
1987 /* No need to write to this slot */
1988 break;
1989 default:
1990 emit_generic_urb_slot(reg, vert_result);
1991 break;
1992 }
1993 }
1994
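/* mlen here counts the message header register as well, so rounding it up
 * to an odd value keeps the URB data portion (mlen - 1) an even number of
 * registers, i.e. the multiple of 256 bits that gen6+ requires.
 */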
1995 static int
1996 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1997 {
1998 struct intel_context *intel = &brw->intel;
1999
2000 if (intel->gen >= 6) {
2001 /* URB data written (does not include the message header reg) must
2002 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2003 * section 5.4.3.2.2: URB_INTERLEAVED.
2004 *
2005 * URB entries are allocated on a multiple of 1024 bits, so an
2006 * extra 128 bits written here to make the end align to 256 is
2007 * no problem.
2008 */
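/* mlen includes the one message header register, so the data portion
 * is (mlen - 1) regs; keeping mlen odd keeps that data portion a
 * multiple of two regs.
 */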
2009 if ((mlen % 2) != 1)
2010 mlen++;
2011 }
2012
2013 return mlen;
2014 }
2015
2016 /**
2017 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2018 * complete the VS thread.
2019 *
2020 * The VUE layout is documented in Volume 2a.
2021 */
2022 void
2023 vec4_visitor::emit_urb_writes()
2024 {
2025 /* MRF 0 is reserved for the debugger, so start with the message
2026 * header in MRF 1.
2027 */
2028 int base_mrf = 1;
2029 int mrf = base_mrf;
2030 /* In the process of generating our URB write message contents, we
2031 * may need to unspill a register or load from an array. Those
2032 * reads would use MRFs 14-15.
2033 */
2034 int max_usable_mrf = 13;
2035
2036 /* The following assertion verifies that max_usable_mrf leaves an even
2037 * number of URB write data registers, which meets gen6's requirement
2038 * that the data length be a multiple of two registers.
2039 */
2040 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2041
2042 /* FINISHME: edgeflag */
2043
2044 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2045 c->prog_data.outputs_written);
2046
2047 /* First mrf is the g0-based message header containing URB handles and such,
2048 * which is implied in VS_OPCODE_URB_WRITE.
2049 */
2050 mrf++;
2051
2052 if (intel->gen < 6) {
2053 emit_ndc_computation();
2054 }
2055
2056 /* Set up the VUE data for the first URB write */
2057 int slot;
2058 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2059 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2060
2061 /* If this was max_usable_mrf, we can't fit anything more into this URB
2062 * WRITE.
2063 */
2064 if (mrf > max_usable_mrf) {
2065 slot++;
2066 break;
2067 }
2068 }
2069
2070 current_annotation = "URB write";
2071 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2072 inst->base_mrf = base_mrf;
2073 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2074 inst->eot = (slot >= c->vue_map.num_slots);
2075
2076 /* Optional second URB write */
2077 if (!inst->eot) {
2078 mrf = base_mrf + 1;
2079
2080 for (; slot < c->vue_map.num_slots; ++slot) {
2081 assert(mrf < max_usable_mrf);
2082
2083 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2084 }
2085
2086 current_annotation = "URB write";
2087 inst = emit(VS_OPCODE_URB_WRITE);
2088 inst->base_mrf = base_mrf;
2089 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2090 inst->eot = true;
2091 /* URB destination offset.  The previous write used MRFs 1-13; minus
2092 * the one header MRF, that's 12 regs of data.  The URB offset is in
2093 * URB row increments, and each of our MRFs is half of a row, since
2094 * we're doing interleaved writes.
2095 */
2096 inst->offset = (max_usable_mrf - base_mrf) / 2;
2097 }
2098
2099 if (intel->gen == 6)
2100 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2101 else
2102 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2103 }
2104
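/**
 * Computes the scratch-space offset source for vec4 slot @reg_offset,
 * scaled into message header units, emitting any relative-address
 * arithmetic before @inst when @reladdr is set.
 */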
2105 src_reg
2106 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2107 src_reg *reladdr, int reg_offset)
2108 {
2109 /* Because we store the values to scratch interleaved like our
2110 * vertex data, we need to scale the vec4 index by 2.
2111 */
2112 int message_header_scale = 2;
2113
2114 /* Pre-gen6, the message header uses byte offsets instead of vec4
2115 * (16-byte) offset units.
2116 */
2117 if (intel->gen < 6)
2118 message_header_scale *= 16;
2119
2120 if (reladdr) {
2121 src_reg index = src_reg(this, glsl_type::int_type);
2122
2123 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2124 emit_before(inst, MUL(dst_reg(index),
2125 index, src_reg(message_header_scale)));
2126
2127 return index;
2128 } else {
2129 return src_reg(reg_offset * message_header_scale);
2130 }
2131 }
2132
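/**
 * Computes the pull constant buffer offset source for vec4 slot
 * @reg_offset, applying the pre-gen6 byte-offset scaling and emitting
 * any relative-address arithmetic before @inst when @reladdr is set.
 */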
2133 src_reg
2134 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2135 src_reg *reladdr, int reg_offset)
2136 {
2137 if (reladdr) {
2138 src_reg index = src_reg(this, glsl_type::int_type);
2139
2140 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2141
2142 /* Pre-gen6, the message header uses byte offsets instead of vec4
2143 * (16-byte) offset units.
2144 */
2145 if (intel->gen < 6) {
2146 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2147 }
2148
2149 return index;
2150 } else {
2151 int message_header_scale = intel->gen < 6 ? 16 : 1;
2152 return src_reg(reg_offset * message_header_scale);
2153 }
2154 }
2155
2156 /**
2157 * Emits an instruction before @inst to load the value named by @orig_src
2158 * from scratch space at @base_offset to @temp.
2159 */
2160 void
2161 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2162 dst_reg temp, src_reg orig_src,
2163 int base_offset)
2164 {
2165 int reg_offset = base_offset + orig_src.reg_offset;
2166 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2167
2168 emit_before(inst, SCRATCH_READ(temp, index));
2169 }
2170
2171 /**
2172 * Emits an instruction after @inst to store the value to be written
2173 * to @orig_dst to scratch space at @base_offset, from @temp.
2174 */
2175 void
2176 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2177 src_reg temp, dst_reg orig_dst,
2178 int base_offset)
2179 {
2180 int reg_offset = base_offset + orig_dst.reg_offset;
2181 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2182
2183 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2184 orig_dst.writemask));
2185 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2186 write->predicate = inst->predicate;
2187 write->ir = inst->ir;
2188 write->annotation = inst->annotation;
2189 inst->insert_after(write);
2190 }
2191
2192 /**
2193 * We can't generally support array access in GRF space, because a
2194 * single instruction's destination can only span 2 contiguous
2195 * registers. So, we send all GRF arrays that get variable index
2196 * access to scratch space.
2197 */
2198 void
2199 vec4_visitor::move_grf_array_access_to_scratch()
2200 {
2201 int scratch_loc[this->virtual_grf_count];
2202
2203 for (int i = 0; i < this->virtual_grf_count; i++) {
2204 scratch_loc[i] = -1;
2205 }
2206
2207 /* First, calculate the set of virtual GRFs that need to be punted
2208 * to scratch due to having any array access on them, and where in
2209 * scratch.
2210 */
2211 foreach_list(node, &this->instructions) {
2212 vec4_instruction *inst = (vec4_instruction *)node;
2213
2214 if (inst->dst.file == GRF && inst->dst.reladdr &&
2215 scratch_loc[inst->dst.reg] == -1) {
2216 scratch_loc[inst->dst.reg] = c->last_scratch;
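/* Scratch stores each vec4 interleaved like vertex data, so every
 * vec4 slot of the array takes a full 8-dword register (8 * 4 bytes).
 */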
2217 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2218 }
2219
2220 for (int i = 0 ; i < 3; i++) {
2221 src_reg *src = &inst->src[i];
2222
2223 if (src->file == GRF && src->reladdr &&
2224 scratch_loc[src->reg] == -1) {
2225 scratch_loc[src->reg] = c->last_scratch;
2226 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2227 }
2228 }
2229 }
2230
2231 /* Now, for anything that will be accessed through scratch, rewrite
2232 * it to load/store. Note that this is a _safe list walk, because
2233 * we may generate a new scratch_write instruction after the one
2234 * we're processing.
2235 */
2236 foreach_list_safe(node, &this->instructions) {
2237 vec4_instruction *inst = (vec4_instruction *)node;
2238
2239 /* Set up the annotation tracking for newly generated instructions. */
2240 base_ir = inst->ir;
2241 current_annotation = inst->annotation;
2242
2243 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2244 src_reg temp = src_reg(this, glsl_type::vec4_type);
2245
2246 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2247
2248 inst->dst.file = temp.file;
2249 inst->dst.reg = temp.reg;
2250 inst->dst.reg_offset = temp.reg_offset;
2251 inst->dst.reladdr = NULL;
2252 }
2253
2254 for (int i = 0 ; i < 3; i++) {
2255 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2256 continue;
2257
2258 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2259
2260 emit_scratch_read(inst, temp, inst->src[i],
2261 scratch_loc[inst->src[i].reg]);
2262
2263 inst->src[i].file = temp.file;
2264 inst->src[i].reg = temp.reg;
2265 inst->src[i].reg_offset = temp.reg_offset;
2266 inst->src[i].reladdr = NULL;
2267 }
2268 }
2269 }
2270
2271 /**
2272 * Emits an instruction before @inst to load the value named by @orig_src
2273 * from the pull constant buffer (surface) at @base_offset to @temp.
2274 */
2275 void
2276 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2277 dst_reg temp, src_reg orig_src,
2278 int base_offset)
2279 {
2280 int reg_offset = base_offset + orig_src.reg_offset;
2281 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2282 vec4_instruction *load;
2283
2284 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2285 temp, index);
2286 load->base_mrf = 14;
2287 load->mlen = 1;
2288 emit_before(inst, load);
2289 }
2290
2291 /**
2292 * Implements array access of uniforms by inserting a
2293 * PULL_CONSTANT_LOAD instruction.
2294 *
2295 * Unlike temporary GRF array access (where we don't support it due to
2296 * the difficulty of doing relative addressing on instruction
2297 * destinations), we could potentially do array access of uniforms
2298 * that were loaded in GRF space as push constants. In real-world
2299 * usage we've seen, though, the arrays being used are always larger
2300 * than we could load as push constants, so just always move all
2301 * uniform array access out to a pull constant buffer.
2302 */
2303 void
2304 vec4_visitor::move_uniform_array_access_to_pull_constants()
2305 {
2306 int pull_constant_loc[this->uniforms];
2307
2308 for (int i = 0; i < this->uniforms; i++) {
2309 pull_constant_loc[i] = -1;
2310 }
2311
2312 /* Walk through and find array access of uniforms. Put a copy of that
2313 * uniform in the pull constant buffer.
2314 *
2315 * Note that we don't move constant-indexed accesses to arrays. No
2316 * testing has been done of the performance impact of this choice.
2317 */
2318 foreach_list_safe(node, &this->instructions) {
2319 vec4_instruction *inst = (vec4_instruction *)node;
2320
2321 for (int i = 0 ; i < 3; i++) {
2322 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2323 continue;
2324
2325 int uniform = inst->src[i].reg;
2326
2327 /* If this array isn't already present in the pull constant buffer,
2328 * add it.
2329 */
2330 if (pull_constant_loc[uniform] == -1) {
2331 const float **values = &prog_data->param[uniform * 4];
2332
2333 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2334
2335 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2336 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2337 }
2338 }
2339
2340 /* Set up the annotation tracking for newly generated instructions. */
2341 base_ir = inst->ir;
2342 current_annotation = inst->annotation;
2343
2344 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2345
2346 emit_pull_constant_load(inst, temp, inst->src[i],
2347 pull_constant_loc[uniform]);
2348
2349 inst->src[i].file = temp.file;
2350 inst->src[i].reg = temp.reg;
2351 inst->src[i].reg_offset = temp.reg_offset;
2352 inst->src[i].reladdr = NULL;
2353 }
2354 }
2355
2356 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2357 * no need to track them as larger-than-vec4 objects. This will be
2358 * relied on in cutting out unused uniform vectors from push
2359 * constants.
2360 */
2361 split_uniform_registers();
2362 }
2363
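/**
 * If a UD-typed source carries a negate modifier, emit an explicit MOV
 * into a temporary and use the temporary instead, so the consuming
 * instruction never sees a negated unsigned source.
 */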
2364 void
2365 vec4_visitor::resolve_ud_negate(src_reg *reg)
2366 {
2367 if (reg->type != BRW_REGISTER_TYPE_UD ||
2368 !reg->negate)
2369 return;
2370
2371 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2372 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2373 *reg = temp;
2374 }
2375
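/**
 * Sets up a visitor for one vertex shader compile: compiler and context
 * pointers, a fresh ralloc memory context, empty virtual GRF tracking,
 * and the variable hash table.
 */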
2376 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2377 struct gl_shader_program *prog,
2378 struct brw_shader *shader)
2379 {
2380 this->c = c;
2381 this->p = &c->func;
2382 this->brw = p->brw;
2383 this->intel = &brw->intel;
2384 this->ctx = &intel->ctx;
2385 this->prog = prog;
2386 this->shader = shader;
2387
2388 this->mem_ctx = ralloc_context(NULL);
2389 this->failed = false;
2390
2391 this->base_ir = NULL;
2392 this->current_annotation = NULL;
2393
2395 this->vp = (struct gl_vertex_program *)
2396 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2397 this->prog_data = &c->prog_data;
2398
2399 this->variable_ht = hash_table_ctor(0,
2400 hash_table_pointer_hash,
2401 hash_table_pointer_compare);
2402
2403 this->virtual_grf_def = NULL;
2404 this->virtual_grf_use = NULL;
2405 this->virtual_grf_sizes = NULL;
2406 this->virtual_grf_count = 0;
2407 this->virtual_grf_reg_map = NULL;
2408 this->virtual_grf_reg_count = 0;
2409 this->virtual_grf_array_size = 0;
2410 this->live_intervals_valid = false;
2411
2412 this->uniforms = 0;
2417 }
2418
2419 vec4_visitor::~vec4_visitor()
2420 {
2421 ralloc_free(this->mem_ctx);
2422 hash_table_dtor(this->variable_ht);
2423 }
2424
2425
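/**
 * Records the first failure of the compile along with a formatted
 * message, printing it to stderr when VS debugging is enabled.
 */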
2426 void
2427 vec4_visitor::fail(const char *format, ...)
2428 {
2429 va_list va;
2430 char *msg;
2431
2432 if (failed)
2433 return;
2434
2435 failed = true;
2436
2437 va_start(va, format);
2438 msg = ralloc_vasprintf(mem_ctx, format, va);
2439 va_end(va);
2440 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2441
2442 this->fail_msg = msg;
2443
2444 if (INTEL_DEBUG & DEBUG_VS) {
2445 fprintf(stderr, "%s", msg);
2446 }
2447 }
2448
2449 } /* namespace brw */