[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 extern "C" {
26 #include "main/macros.h"
27 #include "program/prog_parameter.h"
28 }
29
30 namespace brw {
31
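/**
 * Construct a src_reg that reads back what a dst_reg wrote: the register
 * identity is copied, and a read swizzle is built from the writemask by
 * packing the enabled channels first and then replicating the last
 * enabled channel into the remaining slots (e.g. writemask .xz yields
 * swizzle XZZZ).
 */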
32 src_reg::src_reg(dst_reg reg)
33 {
34 init();
35
36 this->file = reg.file;
37 this->reg = reg.reg;
38 this->reg_offset = reg.reg_offset;
39 this->type = reg.type;
40 this->reladdr = reg.reladdr;
41 this->fixed_hw_reg = reg.fixed_hw_reg;
42
43 int swizzles[4];
44 int next_chan = 0;
45 int last = 0;
46
47 for (int i = 0; i < 4; i++) {
48 if (!(reg.writemask & (1 << i)))
49 continue;
50
51 swizzles[next_chan++] = last = i;
52 }
53
54 for (; next_chan < 4; next_chan++) {
55 swizzles[next_chan] = last;
56 }
57
58 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59 swizzles[2], swizzles[3]);
60 }
61
62 dst_reg::dst_reg(src_reg reg)
63 {
64 init();
65
66 this->file = reg.file;
67 this->reg = reg.reg;
68 this->reg_offset = reg.reg_offset;
69 this->type = reg.type;
70 this->writemask = WRITEMASK_XYZW;
71 this->reladdr = reg.reladdr;
72 this->fixed_hw_reg = reg.fixed_hw_reg;
73 }
74
75 vec4_instruction::vec4_instruction(vec4_visitor *v,
76 enum opcode opcode, dst_reg dst,
77 src_reg src0, src_reg src1, src_reg src2)
78 {
79 this->opcode = opcode;
80 this->dst = dst;
81 this->src[0] = src0;
82 this->src[1] = src1;
83 this->src[2] = src2;
84 this->ir = v->base_ir;
85 this->annotation = v->current_annotation;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(vec4_instruction *inst)
90 {
91 this->instructions.push_tail(inst);
92
93 return inst;
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
98 {
99 new_inst->ir = inst->ir;
100 new_inst->annotation = inst->annotation;
101
102 inst->insert_before(new_inst);
103
104 return inst;
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
109 src_reg src0, src_reg src1, src_reg src2)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
112 src0, src1, src2));
113 }
114
115
116 vec4_instruction *
117 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
118 {
119 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
120 }
121
122 vec4_instruction *
123 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
124 {
125 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
126 }
127
128 vec4_instruction *
129 vec4_visitor::emit(enum opcode opcode)
130 {
131 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
132 }
133
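/* These ALU helpers only construct the instruction in the visitor's
 * mem_ctx; callers are expected to hand the result to emit() (or
 * emit_before()) to actually add it to the instruction stream.
 */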
134 #define ALU1(op) \
135 vec4_instruction * \
136 vec4_visitor::op(dst_reg dst, src_reg src0) \
137 { \
138 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
139 src0); \
140 }
141
142 #define ALU2(op) \
143 vec4_instruction * \
144 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
145 { \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU2(ADD)
157 ALU2(MUL)
158 ALU2(MACH)
159 ALU2(AND)
160 ALU2(OR)
161 ALU2(XOR)
162 ALU2(DP3)
163 ALU2(DP4)
164
165 /** Gen4 predicated IF. */
166 vec4_instruction *
167 vec4_visitor::IF(uint32_t predicate)
168 {
169 vec4_instruction *inst;
170
171 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
172 inst->predicate = predicate;
173
174 return inst;
175 }
176
177 /** Gen6+ IF with embedded comparison. */
178 vec4_instruction *
179 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
180 {
181 assert(intel->gen >= 6);
182
183 vec4_instruction *inst;
184
185 resolve_ud_negate(&src0);
186 resolve_ud_negate(&src1);
187
188 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
189 src0, src1);
190 inst->conditional_mod = condition;
191
192 return inst;
193 }
194
195 /**
196 * CMP: Sets the low bit of the destination channels with the result
197 * of the comparison, while the upper bits are undefined, and updates
198 * the flag register with the packed 16 bits of the result.
199 */
200 vec4_instruction *
201 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
202 {
203 vec4_instruction *inst;
204
205 /* original gen4 does type conversion to the destination type
206 * before comparison, producing garbage results for floating
207 * point comparisons.
208 */
209 if (intel->gen == 4) {
210 dst.type = src0.type;
211 if (dst.file == HW_REG)
212 dst.fixed_hw_reg.type = dst.type;
213 }
214
215 resolve_ud_negate(&src0);
216 resolve_ud_negate(&src1);
217
218 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
219 inst->conditional_mod = condition;
220
221 return inst;
222 }
223
224 vec4_instruction *
225 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
226 {
227 vec4_instruction *inst;
228
229 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
230 dst, index);
231 inst->base_mrf = 14;
232 inst->mlen = 1;
233
234 return inst;
235 }
236
237 vec4_instruction *
238 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
239 {
240 vec4_instruction *inst;
241
242 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
243 dst, src, index);
244 inst->base_mrf = 13;
245 inst->mlen = 2;
246
247 return inst;
248 }
249
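/** Emit a DP2/DP3/DP4, selected by how many components participate. */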
250 void
251 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
252 {
253 static enum opcode dot_opcodes[] = {
254 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
255 };
256
257 emit(dot_opcodes[elements - 2], dst, src0, src1);
258 }
259
260 void
261 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
262 {
263 /* The gen6 math instruction ignores the source modifiers --
264 * swizzle, abs, negate, and at least some parts of the register
265 * region description.
266 *
267 * While it would seem that this MOV could be avoided at this point
268 * in the case that the swizzle is matched up with the destination
269 * writemask, note that uniform packing and register allocation
270 * could rearrange our swizzle, so let's leave this matter up to
271 * copy propagation later.
272 */
273 src_reg temp_src = src_reg(this, glsl_type::vec4_type);
274 emit(MOV(dst_reg(temp_src), src));
275
276 if (dst.writemask != WRITEMASK_XYZW) {
277 /* The gen6 math instruction must be align1, so we can't do
278 * writemasks.
279 */
280 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
281
282 emit(opcode, temp_dst, temp_src);
283
284 emit(MOV(dst, src_reg(temp_dst)));
285 } else {
286 emit(opcode, dst, temp_src);
287 }
288 }
289
290 void
291 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
292 {
293 vec4_instruction *inst = emit(opcode, dst, src);
294 inst->base_mrf = 1;
295 inst->mlen = 1;
296 }
297
298 void
299 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
300 {
301 switch (opcode) {
302 case SHADER_OPCODE_RCP:
303 case SHADER_OPCODE_RSQ:
304 case SHADER_OPCODE_SQRT:
305 case SHADER_OPCODE_EXP2:
306 case SHADER_OPCODE_LOG2:
307 case SHADER_OPCODE_SIN:
308 case SHADER_OPCODE_COS:
309 break;
310 default:
311 assert(!"not reached: bad math opcode");
312 return;
313 }
314
315 if (intel->gen >= 6) {
316 return emit_math1_gen6(opcode, dst, src);
317 } else {
318 return emit_math1_gen4(opcode, dst, src);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen6(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 src_reg expanded;
327
328 /* The gen6 math instruction ignores the source modifiers --
329 * swizzle, abs, negate, and at least some parts of the register
330 * region description. Move the sources to temporaries to make it
331 * generally work.
332 */
333
334 expanded = src_reg(this, glsl_type::vec4_type);
335 expanded.type = src0.type;
336 emit(MOV(dst_reg(expanded), src0));
337 src0 = expanded;
338
339 expanded = src_reg(this, glsl_type::vec4_type);
340 expanded.type = src1.type;
341 emit(MOV(dst_reg(expanded), src1));
342 src1 = expanded;
343
344 if (dst.writemask != WRITEMASK_XYZW) {
345 /* The gen6 math instruction must be align1, so we can't do
346 * writemasks.
347 */
348 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
349 temp_dst.type = dst.type;
350
351 emit(opcode, temp_dst, src0, src1);
352
353 emit(MOV(dst, src_reg(temp_dst)));
354 } else {
355 emit(opcode, dst, src0, src1);
356 }
357 }
358
359 void
360 vec4_visitor::emit_math2_gen4(enum opcode opcode,
361 dst_reg dst, src_reg src0, src_reg src1)
362 {
363 vec4_instruction *inst = emit(opcode, dst, src0, src1);
364 inst->base_mrf = 1;
365 inst->mlen = 2;
366 }
367
368 void
369 vec4_visitor::emit_math(enum opcode opcode,
370 dst_reg dst, src_reg src0, src_reg src1)
371 {
372 switch (opcode) {
373 case SHADER_OPCODE_POW:
374 case SHADER_OPCODE_INT_QUOTIENT:
375 case SHADER_OPCODE_INT_REMAINDER:
376 break;
377 default:
378 assert(!"not reached: unsupported binary math opcode");
379 return;
380 }
381
382 if (intel->gen >= 6) {
383 return emit_math2_gen6(opcode, dst, src0, src1);
384 } else {
385 return emit_math2_gen4(opcode, dst, src0, src1);
386 }
387 }
388
389 void
390 vec4_visitor::visit_instructions(const exec_list *list)
391 {
392 foreach_list(node, list) {
393 ir_instruction *ir = (ir_instruction *)node;
394
395 base_ir = ir;
396 ir->accept(this);
397 }
398 }
399
400
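/**
 * Returns the size of a GLSL type in vec4 slots, the granularity the
 * vec4 backend uses for register allocation and uniform storage.  Every
 * scalar or vector value takes a whole slot; matrices take one slot per
 * column.
 */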
401 static int
402 type_size(const struct glsl_type *type)
403 {
404 unsigned int i;
405 int size;
406
407 switch (type->base_type) {
408 case GLSL_TYPE_UINT:
409 case GLSL_TYPE_INT:
410 case GLSL_TYPE_FLOAT:
411 case GLSL_TYPE_BOOL:
412 if (type->is_matrix()) {
413 return type->matrix_columns;
414 } else {
415 /* Regardless of size of vector, it gets a vec4. This is bad
416 * packing for things like floats, but otherwise arrays become a
417 * mess. Hopefully a later pass over the code can pack scalars
418 * down if appropriate.
419 */
420 return 1;
421 }
422 case GLSL_TYPE_ARRAY:
423 assert(type->length > 0);
424 return type_size(type->fields.array) * type->length;
425 case GLSL_TYPE_STRUCT:
426 size = 0;
427 for (i = 0; i < type->length; i++) {
428 size += type_size(type->fields.structure[i].type);
429 }
430 return size;
431 case GLSL_TYPE_SAMPLER:
432 /* Samplers take up one slot in UNIFORMS[], but they're baked in
433 * at link time.
434 */
435 return 1;
436 default:
437 assert(0);
438 return 0;
439 }
440 }
441
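/**
 * Allocates a new virtual GRF of the given size (in vec4 registers),
 * growing the bookkeeping arrays if needed, and returns its index.
 */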
442 int
443 vec4_visitor::virtual_grf_alloc(int size)
444 {
445 if (virtual_grf_array_size <= virtual_grf_count) {
446 if (virtual_grf_array_size == 0)
447 virtual_grf_array_size = 16;
448 else
449 virtual_grf_array_size *= 2;
450 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
451 virtual_grf_array_size);
452 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
453 virtual_grf_array_size);
454 }
455 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
456 virtual_grf_reg_count += size;
457 virtual_grf_sizes[virtual_grf_count] = size;
458 return virtual_grf_count++;
459 }
460
461 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
462 {
463 init();
464
465 this->file = GRF;
466 this->reg = v->virtual_grf_alloc(type_size(type));
467
468 if (type->is_array() || type->is_record()) {
469 this->swizzle = BRW_SWIZZLE_NOOP;
470 } else {
471 this->swizzle = swizzle_for_size(type->vector_elements);
472 }
473
474 this->type = brw_type_for_base_type(type);
475 }
476
477 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
478 {
479 init();
480
481 this->file = GRF;
482 this->reg = v->virtual_grf_alloc(type_size(type));
483
484 if (type->is_array() || type->is_record()) {
485 this->writemask = WRITEMASK_XYZW;
486 } else {
487 this->writemask = (1 << type->vector_elements) - 1;
488 }
489
490 this->type = brw_type_for_base_type(type);
491 }
492
493 /* Our support for uniforms is piggy-backed on the struct
494 * gl_fragment_program, because that's where the values actually
495 * get stored, rather than in some global gl_shader_program uniform
496 * store.
497 */
498 int
499 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
500 {
501 unsigned int offset = 0;
502 float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
503
504 if (type->is_matrix()) {
505 const glsl_type *column = type->column_type();
506
507 for (unsigned int i = 0; i < type->matrix_columns; i++) {
508 offset += setup_uniform_values(loc + offset, column);
509 }
510
511 return offset;
512 }
513
514 switch (type->base_type) {
515 case GLSL_TYPE_FLOAT:
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_BOOL:
519 for (unsigned int i = 0; i < type->vector_elements; i++) {
520 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
521 }
522
523 /* Set up pad elements to get things aligned to a vec4 boundary. */
524 for (unsigned int i = type->vector_elements; i < 4; i++) {
525 static float zero = 0;
526
527 c->prog_data.param[this->uniforms * 4 + i] = &zero;
528 }
529
530 /* Track the size of this uniform vector, for future packing of
531 * uniforms.
532 */
533 this->uniform_vector_size[this->uniforms] = type->vector_elements;
534 this->uniforms++;
535
536 return 1;
537
538 case GLSL_TYPE_STRUCT:
539 for (unsigned int i = 0; i < type->length; i++) {
540 offset += setup_uniform_values(loc + offset,
541 type->fields.structure[i].type);
542 }
543 return offset;
544
545 case GLSL_TYPE_ARRAY:
546 for (unsigned int i = 0; i < type->length; i++) {
547 offset += setup_uniform_values(loc + offset, type->fields.array);
548 }
549 return offset;
550
551 case GLSL_TYPE_SAMPLER:
552 /* The sampler takes up a slot, but we don't use any values from it. */
553 return 1;
554
555 default:
556 assert(!"not reached");
557 return 0;
558 }
559 }
560
561 void
562 vec4_visitor::setup_uniform_clipplane_values()
563 {
564 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
565
566 /* Pre-Gen6, we compact clip planes. For example, if the user
567 * enables just clip planes 0, 1, and 3, we will enable clip planes
568 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
569 * plane 2. This simplifies the implementation of the Gen6 clip
570 * thread.
571 *
572 * In Gen6 and later, we don't compact clip planes, because this
573 * simplifies the implementation of gl_ClipDistance.
574 */
575 int compacted_clipplane_index = 0;
576 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
577 if (intel->gen < 6 &&
578 !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
579 continue;
580 }
581 this->uniform_vector_size[this->uniforms] = 4;
582 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
583 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
584 for (int j = 0; j < 4; ++j) {
585 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
586 }
587 ++compacted_clipplane_index;
588 ++this->uniforms;
589 }
590 }
591
592 /* Our support for builtin uniforms is even scarier than non-builtin.
593 * It sits on top of the PROG_STATE_VAR parameters that are
594 * automatically updated from GL context state.
595 */
596 void
597 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
598 {
599 const ir_state_slot *const slots = ir->state_slots;
600 assert(ir->state_slots != NULL);
601
602 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
603 /* This state reference has already been setup by ir_to_mesa,
604 * but we'll get the same index back here. We can reference
605 * ParameterValues directly, since unlike brw_fs.cpp, we never
606 * add new state references during compile.
607 */
608 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
609 (gl_state_index *)slots[i].tokens);
610 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
611
612 this->uniform_vector_size[this->uniforms] = 0;
613 /* Add each of the unique swizzled channels of the element.
614 * This will end up matching the size of the glsl_type of this field.
615 */
616 int last_swiz = -1;
617 for (unsigned int j = 0; j < 4; j++) {
618 int swiz = GET_SWZ(slots[i].swizzle, j);
619 last_swiz = swiz;
620
621 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
622 if (swiz <= last_swiz)
623 this->uniform_vector_size[this->uniforms]++;
624 }
625 this->uniforms++;
626 }
627 }
628
629 dst_reg *
630 vec4_visitor::variable_storage(ir_variable *var)
631 {
632 return (dst_reg *)hash_table_find(this->variable_ht, var);
633 }
634
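/**
 * Emit instructions that leave the flag register set according to a
 * boolean condition, and store in *predicate the predication mode that a
 * following predicated instruction should use (BRW_PREDICATE_NORMAL, or
 * the ALL4H/ANY4H modes for the vector comparisons).
 */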
635 void
636 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
637 {
638 ir_expression *expr = ir->as_expression();
639
640 *predicate = BRW_PREDICATE_NORMAL;
641
642 if (expr) {
643 src_reg op[2];
644 vec4_instruction *inst;
645
646 assert(expr->get_num_operands() <= 2);
647 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
648 expr->operands[i]->accept(this);
649 op[i] = this->result;
650
651 resolve_ud_negate(&op[i]);
652 }
653
654 switch (expr->operation) {
655 case ir_unop_logic_not:
656 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
657 inst->conditional_mod = BRW_CONDITIONAL_Z;
658 break;
659
660 case ir_binop_logic_xor:
661 inst = emit(XOR(dst_null_d(), op[0], op[1]));
662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
663 break;
664
665 case ir_binop_logic_or:
666 inst = emit(OR(dst_null_d(), op[0], op[1]));
667 inst->conditional_mod = BRW_CONDITIONAL_NZ;
668 break;
669
670 case ir_binop_logic_and:
671 inst = emit(AND(dst_null_d(), op[0], op[1]));
672 inst->conditional_mod = BRW_CONDITIONAL_NZ;
673 break;
674
675 case ir_unop_f2b:
676 if (intel->gen >= 6) {
677 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
678 } else {
679 inst = emit(MOV(dst_null_f(), op[0]));
680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
681 }
682 break;
683
684 case ir_unop_i2b:
685 if (intel->gen >= 6) {
686 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
687 } else {
688 inst = emit(MOV(dst_null_d(), op[0]));
689 inst->conditional_mod = BRW_CONDITIONAL_NZ;
690 }
691 break;
692
693 case ir_binop_all_equal:
694 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
695 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
696 break;
697
698 case ir_binop_any_nequal:
699 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
700 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
701 break;
702
703 case ir_unop_any:
704 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
705 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
706 break;
707
708 case ir_binop_greater:
709 case ir_binop_gequal:
710 case ir_binop_less:
711 case ir_binop_lequal:
712 case ir_binop_equal:
713 case ir_binop_nequal:
714 emit(CMP(dst_null_d(), op[0], op[1],
715 brw_conditional_for_comparison(expr->operation)));
716 break;
717
718 default:
719 assert(!"not reached");
720 break;
721 }
722 return;
723 }
724
725 ir->accept(this);
726
727 resolve_ud_negate(&this->result);
728
729 if (intel->gen >= 6) {
730 vec4_instruction *inst = emit(AND(dst_null_d(),
731 this->result, src_reg(1)));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 } else {
734 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
735 inst->conditional_mod = BRW_CONDITIONAL_NZ;
736 }
737 }
738
739 /**
740 * Emit a gen6 IF statement with the comparison folded into the IF
741 * instruction.
742 */
743 void
744 vec4_visitor::emit_if_gen6(ir_if *ir)
745 {
746 ir_expression *expr = ir->condition->as_expression();
747
748 if (expr) {
749 src_reg op[2];
750 dst_reg temp;
751
752 assert(expr->get_num_operands() <= 2);
753 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
754 expr->operands[i]->accept(this);
755 op[i] = this->result;
756 }
757
758 switch (expr->operation) {
759 case ir_unop_logic_not:
760 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
761 return;
762
763 case ir_binop_logic_xor:
764 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
765 return;
766
767 case ir_binop_logic_or:
768 temp = dst_reg(this, glsl_type::bool_type);
769 emit(OR(temp, op[0], op[1]));
770 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
771 return;
772
773 case ir_binop_logic_and:
774 temp = dst_reg(this, glsl_type::bool_type);
775 emit(AND(temp, op[0], op[1]));
776 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
777 return;
778
779 case ir_unop_f2b:
780 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 return;
782
783 case ir_unop_i2b:
784 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
785 return;
786
787 case ir_binop_greater:
788 case ir_binop_gequal:
789 case ir_binop_less:
790 case ir_binop_lequal:
791 case ir_binop_equal:
792 case ir_binop_nequal:
793 emit(IF(op[0], op[1],
794 brw_conditional_for_comparison(expr->operation)));
795 return;
796
797 case ir_binop_all_equal:
798 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
799 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
800 return;
801
802 case ir_binop_any_nequal:
803 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
804 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
805 return;
806
807 case ir_unop_any:
808 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
809 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
810 return;
811
812 default:
813 assert(!"not reached");
814 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
815 return;
816 }
817 return;
818 }
819
820 ir->condition->accept(this);
821
822 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
823 }
824
825 void
826 vec4_visitor::visit(ir_variable *ir)
827 {
828 dst_reg *reg = NULL;
829
830 if (variable_storage(ir))
831 return;
832
833 switch (ir->mode) {
834 case ir_var_in:
835 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
836
837 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
838 * come in as floating point conversions of the integer values.
839 */
840 for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
841 if (!c->key.gl_fixed_input_size[i])
842 continue;
843
844 dst_reg dst = *reg;
845 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
846 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
847 }
848 break;
849
850 case ir_var_out:
851 reg = new(mem_ctx) dst_reg(this, ir->type);
852
853 for (int i = 0; i < type_size(ir->type); i++) {
854 output_reg[ir->location + i] = *reg;
855 output_reg[ir->location + i].reg_offset = i;
856 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
857 output_reg_annotation[ir->location + i] = ir->name;
858 }
859 break;
860
861 case ir_var_auto:
862 case ir_var_temporary:
863 reg = new(mem_ctx) dst_reg(this, ir->type);
864 break;
865
866 case ir_var_uniform:
867 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
868
869 /* Track how big the whole uniform variable is, in case we need to put a
870 * copy of its data into pull constants for array access.
871 */
872 this->uniform_size[this->uniforms] = type_size(ir->type);
873
874 if (!strncmp(ir->name, "gl_", 3)) {
875 setup_builtin_uniform_values(ir);
876 } else {
877 setup_uniform_values(ir->location, ir->type);
878 }
879 break;
880
881 default:
882 assert(!"not reached");
883 }
884
885 reg->type = brw_type_for_base_type(ir->type);
886 hash_table_insert(this->variable_ht, reg, ir);
887 }
888
889 void
890 vec4_visitor::visit(ir_loop *ir)
891 {
892 dst_reg counter;
893
894 /* We don't want debugging output to print the whole body of the
895 * loop as the annotation.
896 */
897 this->base_ir = NULL;
898
899 if (ir->counter != NULL) {
900 this->base_ir = ir->counter;
901 ir->counter->accept(this);
902 counter = *(variable_storage(ir->counter));
903
904 if (ir->from != NULL) {
905 this->base_ir = ir->from;
906 ir->from->accept(this);
907
908 emit(MOV(counter, this->result));
909 }
910 }
911
912 emit(BRW_OPCODE_DO);
913
914 if (ir->to) {
915 this->base_ir = ir->to;
916 ir->to->accept(this);
917
918 emit(CMP(dst_null_d(), src_reg(counter), this->result,
919 brw_conditional_for_comparison(ir->cmp)));
920
921 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
922 inst->predicate = BRW_PREDICATE_NORMAL;
923 }
924
925 visit_instructions(&ir->body_instructions);
926
927
928 if (ir->increment) {
929 this->base_ir = ir->increment;
930 ir->increment->accept(this);
931 emit(ADD(counter, src_reg(counter), this->result));
932 }
933
934 emit(BRW_OPCODE_WHILE);
935 }
936
937 void
938 vec4_visitor::visit(ir_loop_jump *ir)
939 {
940 switch (ir->mode) {
941 case ir_loop_jump::jump_break:
942 emit(BRW_OPCODE_BREAK);
943 break;
944 case ir_loop_jump::jump_continue:
945 emit(BRW_OPCODE_CONTINUE);
946 break;
947 }
948 }
949
950
951 void
952 vec4_visitor::visit(ir_function_signature *ir)
953 {
954 assert(0);
955 (void)ir;
956 }
957
958 void
959 vec4_visitor::visit(ir_function *ir)
960 {
961 /* Ignore function bodies other than main() -- we shouldn't see calls to
962 * them since they should all be inlined.
963 */
964 if (strcmp(ir->name, "main") == 0) {
965 const ir_function_signature *sig;
966 exec_list empty;
967
968 sig = ir->matching_signature(&empty);
969
970 assert(sig);
971
972 visit_instructions(&sig->body);
973 }
974 }
975
976 bool
977 vec4_visitor::try_emit_sat(ir_expression *ir)
978 {
979 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
980 if (!sat_src)
981 return false;
982
983 sat_src->accept(this);
984 src_reg src = this->result;
985
986 this->result = src_reg(this, ir->type);
987 vec4_instruction *inst;
988 inst = emit(MOV(dst_reg(this->result), src));
989 inst->saturate = true;
990
991 return true;
992 }
993
994 void
995 vec4_visitor::emit_bool_comparison(unsigned int op,
996 dst_reg dst, src_reg src0, src_reg src1)
997 {
998 /* original gen4 does destination conversion before comparison. */
999 if (intel->gen < 5)
1000 dst.type = src0.type;
1001
1002 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1003
1004 dst.type = BRW_REGISTER_TYPE_D;
1005 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_expression *ir)
1010 {
1011 unsigned int operand;
1012 src_reg op[Elements(ir->operands)];
1013 src_reg result_src;
1014 dst_reg result_dst;
1015 vec4_instruction *inst;
1016
1017 if (try_emit_sat(ir))
1018 return;
1019
1020 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1021 this->result.file = BAD_FILE;
1022 ir->operands[operand]->accept(this);
1023 if (this->result.file == BAD_FILE) {
1024 printf("Failed to get tree for expression operand:\n");
1025 ir->operands[operand]->print();
1026 exit(1);
1027 }
1028 op[operand] = this->result;
1029
1030 /* Matrix expression operands should have been broken down to vector
1031 * operations already.
1032 */
1033 assert(!ir->operands[operand]->type->is_matrix());
1034 }
1035
1036 int vector_elements = ir->operands[0]->type->vector_elements;
1037 if (ir->operands[1]) {
1038 vector_elements = MAX2(vector_elements,
1039 ir->operands[1]->type->vector_elements);
1040 }
1041
1042 this->result.file = BAD_FILE;
1043
1044 /* Storage for our result. Ideally for an assignment we'd be using
1045 * the actual storage for the result here, instead.
1046 */
1047 result_src = src_reg(this, ir->type);
1048 /* convenience for the emit functions below. */
1049 result_dst = dst_reg(result_src);
1050 /* If nothing special happens, this is the result. */
1051 this->result = result_src;
1052 /* Limit writes to the channels that will be used by result_src later.
1053 * This does limit this temp's use as a temporary for multi-instruction
1054 * sequences.
1055 */
1056 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1057
1058 switch (ir->operation) {
1059 case ir_unop_logic_not:
1060 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1061 * the ones complement of the whole register, not just bit 0.
1062 */
1063 emit(XOR(result_dst, op[0], src_reg(1)));
1064 break;
1065 case ir_unop_neg:
1066 op[0].negate = !op[0].negate;
1067 this->result = op[0];
1068 break;
1069 case ir_unop_abs:
1070 op[0].abs = true;
1071 op[0].negate = false;
1072 this->result = op[0];
1073 break;
1074
1075 case ir_unop_sign:
1076 emit(MOV(result_dst, src_reg(0.0f)));
1077
1078 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1079 inst = emit(MOV(result_dst, src_reg(1.0f)));
1080 inst->predicate = BRW_PREDICATE_NORMAL;
1081
1082 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1083 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1084 inst->predicate = BRW_PREDICATE_NORMAL;
1085
1086 break;
1087
1088 case ir_unop_rcp:
1089 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1090 break;
1091
1092 case ir_unop_exp2:
1093 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1094 break;
1095 case ir_unop_log2:
1096 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1097 break;
1098 case ir_unop_exp:
1099 case ir_unop_log:
1100 assert(!"not reached: should be handled by ir_explog_to_explog2");
1101 break;
1102 case ir_unop_sin:
1103 case ir_unop_sin_reduced:
1104 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1105 break;
1106 case ir_unop_cos:
1107 case ir_unop_cos_reduced:
1108 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1109 break;
1110
1111 case ir_unop_dFdx:
1112 case ir_unop_dFdy:
1113 assert(!"derivatives not valid in vertex shader");
1114 break;
1115
1116 case ir_unop_noise:
1117 assert(!"not reached: should be handled by lower_noise");
1118 break;
1119
1120 case ir_binop_add:
1121 emit(ADD(result_dst, op[0], op[1]));
1122 break;
1123 case ir_binop_sub:
1124 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1125 break;
1126
1127 case ir_binop_mul:
1128 if (ir->type->is_integer()) {
1129 /* For integer multiplication, the MUL uses the low 16 bits
1130 * of one of the operands (src0 on gen6, src1 on gen7). The
1131 * MACH accumulates in the contribution of the upper 16 bits
1132 * of that operand.
1133 *
1134 * FINISHME: Emit just the MUL if we know an operand is small
1135 * enough.
1136 */
1137 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1138
1139 emit(MUL(acc, op[0], op[1]));
1140 emit(MACH(dst_null_d(), op[0], op[1]));
1141 emit(MOV(result_dst, src_reg(acc)));
1142 } else {
1143 emit(MUL(result_dst, op[0], op[1]));
1144 }
1145 break;
1146 case ir_binop_div:
1147 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1148 assert(ir->type->is_integer());
1149 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1150 break;
1151 case ir_binop_mod:
1152 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1153 assert(ir->type->is_integer());
1154 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1155 break;
1156
1157 case ir_binop_less:
1158 case ir_binop_greater:
1159 case ir_binop_lequal:
1160 case ir_binop_gequal:
1161 case ir_binop_equal:
1162 case ir_binop_nequal: {
1163 emit(CMP(result_dst, op[0], op[1],
1164 brw_conditional_for_comparison(ir->operation)));
1165 emit(AND(result_dst, result_src, src_reg(0x1)));
1166 break;
1167 }
1168
1169 case ir_binop_all_equal:
1170 /* "==" operator producing a scalar boolean. */
1171 if (ir->operands[0]->type->is_vector() ||
1172 ir->operands[1]->type->is_vector()) {
1173 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1174 emit(MOV(result_dst, src_reg(0)));
1175 inst = emit(MOV(result_dst, src_reg(1)));
1176 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1177 } else {
1178 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1179 emit(AND(result_dst, result_src, src_reg(0x1)));
1180 }
1181 break;
1182 case ir_binop_any_nequal:
1183 /* "!=" operator producing a scalar boolean. */
1184 if (ir->operands[0]->type->is_vector() ||
1185 ir->operands[1]->type->is_vector()) {
1186 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1187
1188 emit(MOV(result_dst, src_reg(0)));
1189 inst = emit(MOV(result_dst, src_reg(1)));
1190 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1191 } else {
1192 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1193 emit(AND(result_dst, result_src, src_reg(0x1)));
1194 }
1195 break;
1196
1197 case ir_unop_any:
1198 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1199 emit(MOV(result_dst, src_reg(0)));
1200
1201 inst = emit(MOV(result_dst, src_reg(1)));
1202 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1203 break;
1204
1205 case ir_binop_logic_xor:
1206 emit(XOR(result_dst, op[0], op[1]));
1207 break;
1208
1209 case ir_binop_logic_or:
1210 emit(OR(result_dst, op[0], op[1]));
1211 break;
1212
1213 case ir_binop_logic_and:
1214 emit(AND(result_dst, op[0], op[1]));
1215 break;
1216
1217 case ir_binop_dot:
1218 assert(ir->operands[0]->type->is_vector());
1219 assert(ir->operands[0]->type == ir->operands[1]->type);
1220 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1221 break;
1222
1223 case ir_unop_sqrt:
1224 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1225 break;
1226 case ir_unop_rsq:
1227 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1228 break;
1229 case ir_unop_i2f:
1230 case ir_unop_i2u:
1231 case ir_unop_u2i:
1232 case ir_unop_u2f:
1233 case ir_unop_b2f:
1234 case ir_unop_b2i:
1235 case ir_unop_f2i:
1236 emit(MOV(result_dst, op[0]));
1237 break;
1238 case ir_unop_f2b:
1239 case ir_unop_i2b: {
1240 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1241 emit(AND(result_dst, result_src, src_reg(1)));
1242 break;
1243 }
1244
1245 case ir_unop_trunc:
1246 emit(RNDZ(result_dst, op[0]));
1247 break;
1248 case ir_unop_ceil:
1249 op[0].negate = !op[0].negate;
1250 inst = emit(RNDD(result_dst, op[0]));
1251 this->result.negate = true;
1252 break;
1253 case ir_unop_floor:
1254 inst = emit(RNDD(result_dst, op[0]));
1255 break;
1256 case ir_unop_fract:
1257 inst = emit(FRC(result_dst, op[0]));
1258 break;
1259 case ir_unop_round_even:
1260 emit(RNDE(result_dst, op[0]));
1261 break;
1262
1263 case ir_binop_min:
1264 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1265
1266 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 break;
1269 case ir_binop_max:
1270 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1271
1272 inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 break;
1275
1276 case ir_binop_pow:
1277 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1278 break;
1279
1280 case ir_unop_bit_not:
1281 inst = emit(NOT(result_dst, op[0]));
1282 break;
1283 case ir_binop_bit_and:
1284 inst = emit(AND(result_dst, op[0], op[1]));
1285 break;
1286 case ir_binop_bit_xor:
1287 inst = emit(XOR(result_dst, op[0], op[1]));
1288 break;
1289 case ir_binop_bit_or:
1290 inst = emit(OR(result_dst, op[0], op[1]));
1291 break;
1292
1293 case ir_binop_lshift:
1294 inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1295 break;
1296
1297 case ir_binop_rshift:
1298 if (ir->type->base_type == GLSL_TYPE_INT)
1299 inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1300 else
1301 inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1302 break;
1303
1304 case ir_quadop_vector:
1305 assert(!"not reached: should be handled by lower_quadop_vector");
1306 break;
1307 }
1308 }
1309
1310
1311 void
1312 vec4_visitor::visit(ir_swizzle *ir)
1313 {
1314 src_reg src;
1315 int i = 0;
1316 int swizzle[4];
1317
1318 /* Note that this is only swizzles in expressions, not those on the left
1319 * hand side of an assignment, which do write masking. See ir_assignment
1320 * for that.
1321 */
1322
1323 ir->val->accept(this);
1324 src = this->result;
1325 assert(src.file != BAD_FILE);
1326
1327 for (i = 0; i < ir->type->vector_elements; i++) {
1328 switch (i) {
1329 case 0:
1330 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1331 break;
1332 case 1:
1333 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1334 break;
1335 case 2:
1336 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1337 break;
1338 case 3:
1339 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1340 break;
1341 }
1342 }
1343 for (; i < 4; i++) {
1344 /* Replicate the last channel out. */
1345 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1346 }
1347
1348 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1349
1350 this->result = src;
1351 }
1352
1353 void
1354 vec4_visitor::visit(ir_dereference_variable *ir)
1355 {
1356 const struct glsl_type *type = ir->type;
1357 dst_reg *reg = variable_storage(ir->var);
1358
1359 if (!reg) {
1360 fail("Failed to find variable storage for %s\n", ir->var->name);
1361 this->result = src_reg(brw_null_reg());
1362 return;
1363 }
1364
1365 this->result = src_reg(*reg);
1366
1367 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1368 this->result.swizzle = swizzle_for_size(type->vector_elements);
1369 }
1370
1371 void
1372 vec4_visitor::visit(ir_dereference_array *ir)
1373 {
1374 ir_constant *constant_index;
1375 src_reg src;
1376 int element_size = type_size(ir->type);
1377
1378 constant_index = ir->array_index->constant_expression_value();
1379
1380 ir->array->accept(this);
1381 src = this->result;
1382
1383 if (constant_index) {
1384 src.reg_offset += constant_index->value.i[0] * element_size;
1385 } else {
1386 /* Variable index array dereference. It eats the "vec4" of the
1387 * base of the array and an index that offsets the Mesa register
1388 * index.
1389 */
1390 ir->array_index->accept(this);
1391
1392 src_reg index_reg;
1393
1394 if (element_size == 1) {
1395 index_reg = this->result;
1396 } else {
1397 index_reg = src_reg(this, glsl_type::int_type);
1398
1399 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1400 }
1401
1402 if (src.reladdr) {
1403 src_reg temp = src_reg(this, glsl_type::int_type);
1404
1405 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1406
1407 index_reg = temp;
1408 }
1409
1410 src.reladdr = ralloc(mem_ctx, src_reg);
1411 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1412 }
1413
1414 /* If the type is smaller than a vec4, replicate the last channel out. */
1415 if (ir->type->is_scalar() || ir->type->is_vector())
1416 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1417 else
1418 src.swizzle = BRW_SWIZZLE_NOOP;
1419 src.type = brw_type_for_base_type(ir->type);
1420
1421 this->result = src;
1422 }
1423
1424 void
1425 vec4_visitor::visit(ir_dereference_record *ir)
1426 {
1427 unsigned int i;
1428 const glsl_type *struct_type = ir->record->type;
1429 int offset = 0;
1430
1431 ir->record->accept(this);
1432
1433 for (i = 0; i < struct_type->length; i++) {
1434 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1435 break;
1436 offset += type_size(struct_type->fields.structure[i].type);
1437 }
1438
1439 /* If the type is smaller than a vec4, replicate the last channel out. */
1440 if (ir->type->is_scalar() || ir->type->is_vector())
1441 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1442 else
1443 this->result.swizzle = BRW_SWIZZLE_NOOP;
1444 this->result.type = brw_type_for_base_type(ir->type);
1445
1446 this->result.reg_offset += offset;
1447 }
1448
1449 /**
1450 * We want to be careful in assignment setup to hit the actual storage
1451 * instead of potentially using a temporary like we might with the
1452 * ir_dereference handler.
1453 */
1454 static dst_reg
1455 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1456 {
1457 /* The LHS must be a dereference. If the LHS is a variable indexed array
1458 * access of a vector, it must be separated into a series of conditional moves
1459 * before reaching this point (see ir_vec_index_to_cond_assign).
1460 */
1461 assert(ir->as_dereference());
1462 ir_dereference_array *deref_array = ir->as_dereference_array();
1463 if (deref_array) {
1464 assert(!deref_array->array->type->is_vector());
1465 }
1466
1467 /* Use the rvalue deref handler for the most part. We'll ignore
1468 * swizzles in it and write swizzles using writemask, though.
1469 */
1470 ir->accept(v);
1471 return dst_reg(v->result);
1472 }
1473
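/**
 * Emit a (possibly predicated) MOV for each scalar/vector component of an
 * aggregate, recursing through struct fields, array elements and matrix
 * columns and advancing dst/src by one vec4 per move.
 */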
1474 void
1475 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1476 const struct glsl_type *type, uint32_t predicate)
1477 {
1478 if (type->base_type == GLSL_TYPE_STRUCT) {
1479 for (unsigned int i = 0; i < type->length; i++) {
1480 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1481 }
1482 return;
1483 }
1484
1485 if (type->is_array()) {
1486 for (unsigned int i = 0; i < type->length; i++) {
1487 emit_block_move(dst, src, type->fields.array, predicate);
1488 }
1489 return;
1490 }
1491
1492 if (type->is_matrix()) {
1493 const struct glsl_type *vec_type;
1494
1495 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1496 type->vector_elements, 1);
1497
1498 for (int i = 0; i < type->matrix_columns; i++) {
1499 emit_block_move(dst, src, vec_type, predicate);
1500 }
1501 return;
1502 }
1503
1504 assert(type->is_scalar() || type->is_vector());
1505
1506 dst->type = brw_type_for_base_type(type);
1507 src->type = dst->type;
1508
1509 dst->writemask = (1 << type->vector_elements) - 1;
1510
1511 /* Do we need to worry about swizzling a swizzle? */
1512 assert(src->swizzle == BRW_SWIZZLE_NOOP
1513 || src->swizzle == swizzle_for_size(type->vector_elements));
1514 src->swizzle = swizzle_for_size(type->vector_elements);
1515
1516 vec4_instruction *inst = emit(MOV(*dst, *src));
1517 inst->predicate = predicate;
1518
1519 dst->reg_offset++;
1520 src->reg_offset++;
1521 }
1522
1523
1524 /* If the RHS processing resulted in an instruction generating a
1525 * temporary value, and it would be easy to rewrite the instruction to
1526 * generate its result right into the LHS instead, do so. This ends
1527 * up reliably removing instructions where it can be tricky to do so
1528 * later without real UD chain information.
1529 */
1530 bool
1531 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1532 dst_reg dst,
1533 src_reg src,
1534 vec4_instruction *pre_rhs_inst,
1535 vec4_instruction *last_rhs_inst)
1536 {
1537 /* This could be supported, but it would take more smarts. */
1538 if (ir->condition)
1539 return false;
1540
1541 if (pre_rhs_inst == last_rhs_inst)
1542 return false; /* No instructions generated to work with. */
1543
1544 /* Make sure the last instruction generated our source reg. */
1545 if (src.file != GRF ||
1546 src.file != last_rhs_inst->dst.file ||
1547 src.reg != last_rhs_inst->dst.reg ||
1548 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1549 src.reladdr ||
1550 src.abs ||
1551 src.negate ||
1552 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1553 return false;
1554
1555 /* Check that that last instruction fully initialized the channels
1556 * we want to use, in the order we want to use them. We could
1557 * potentially reswizzle the operands of many instructions so that
1558 * we could handle out of order channels, but don't yet.
1559 */
1560
1561 for (unsigned i = 0; i < 4; i++) {
1562 if (dst.writemask & (1 << i)) {
1563 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1564 return false;
1565
1566 if (BRW_GET_SWZ(src.swizzle, i) != i)
1567 return false;
1568 }
1569 }
1570
1571 /* Success! Rewrite the instruction. */
1572 last_rhs_inst->dst.file = dst.file;
1573 last_rhs_inst->dst.reg = dst.reg;
1574 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1575 last_rhs_inst->dst.reladdr = dst.reladdr;
1576 last_rhs_inst->dst.writemask &= dst.writemask;
1577
1578 return true;
1579 }
1580
1581 void
1582 vec4_visitor::visit(ir_assignment *ir)
1583 {
1584 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1585 uint32_t predicate = BRW_PREDICATE_NONE;
1586
1587 if (!ir->lhs->type->is_scalar() &&
1588 !ir->lhs->type->is_vector()) {
1589 ir->rhs->accept(this);
1590 src_reg src = this->result;
1591
1592 if (ir->condition) {
1593 emit_bool_to_cond_code(ir->condition, &predicate);
1594 }
1595
1596 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1597 return;
1598 }
1599
1600 /* Now we're down to just a scalar/vector with writemasks. */
1601 int i;
1602
1603 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1604 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1605
1606 ir->rhs->accept(this);
1607
1608 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1609
1610 src_reg src = this->result;
1611
1612 int swizzles[4];
1613 int first_enabled_chan = 0;
1614 int src_chan = 0;
1615
1616 assert(ir->lhs->type->is_vector() ||
1617 ir->lhs->type->is_scalar());
1618 dst.writemask = ir->write_mask;
1619
1620 for (int i = 0; i < 4; i++) {
1621 if (dst.writemask & (1 << i)) {
1622 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1623 break;
1624 }
1625 }
1626
1627 /* Swizzle a small RHS vector into the channels being written.
1628 *
1629 * glsl ir treats write_mask as dictating how many channels are
1630 * present on the RHS while in our instructions we need to make
1631 * those channels appear in the slots of the vec4 they're written to.
1632 */
1633 for (int i = 0; i < 4; i++) {
1634 if (dst.writemask & (1 << i))
1635 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1636 else
1637 swizzles[i] = first_enabled_chan;
1638 }
1639 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1640 swizzles[2], swizzles[3]);
1641
1642 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1643 return;
1644 }
1645
1646 if (ir->condition) {
1647 emit_bool_to_cond_code(ir->condition, &predicate);
1648 }
1649
1650 for (i = 0; i < type_size(ir->lhs->type); i++) {
1651 vec4_instruction *inst = emit(MOV(dst, src));
1652 inst->predicate = predicate;
1653
1654 dst.reg_offset++;
1655 src.reg_offset++;
1656 }
1657 }
1658
1659 void
1660 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1661 {
1662 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1663 foreach_list(node, &ir->components) {
1664 ir_constant *field_value = (ir_constant *)node;
1665
1666 emit_constant_values(dst, field_value);
1667 }
1668 return;
1669 }
1670
1671 if (ir->type->is_array()) {
1672 for (unsigned int i = 0; i < ir->type->length; i++) {
1673 emit_constant_values(dst, ir->array_elements[i]);
1674 }
1675 return;
1676 }
1677
1678 if (ir->type->is_matrix()) {
1679 for (int i = 0; i < ir->type->matrix_columns; i++) {
1680 for (int j = 0; j < ir->type->vector_elements; j++) {
1681 dst->writemask = 1 << j;
1682 dst->type = BRW_REGISTER_TYPE_F;
1683
1684 emit(MOV(*dst,
1685 src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1686 }
1687 dst->reg_offset++;
1688 }
1689 return;
1690 }
1691
1692 for (int i = 0; i < ir->type->vector_elements; i++) {
1693 dst->writemask = 1 << i;
1694 dst->type = brw_type_for_base_type(ir->type);
1695
1696 switch (ir->type->base_type) {
1697 case GLSL_TYPE_FLOAT:
1698 emit(MOV(*dst, src_reg(ir->value.f[i])));
1699 break;
1700 case GLSL_TYPE_INT:
1701 emit(MOV(*dst, src_reg(ir->value.i[i])));
1702 break;
1703 case GLSL_TYPE_UINT:
1704 emit(MOV(*dst, src_reg(ir->value.u[i])));
1705 break;
1706 case GLSL_TYPE_BOOL:
1707 emit(MOV(*dst, src_reg(ir->value.b[i])));
1708 break;
1709 default:
1710 assert(!"Non-float/uint/int/bool constant");
1711 break;
1712 }
1713 }
1714 dst->reg_offset++;
1715 }
1716
1717 void
1718 vec4_visitor::visit(ir_constant *ir)
1719 {
1720 dst_reg dst = dst_reg(this, ir->type);
1721 this->result = src_reg(dst);
1722
1723 emit_constant_values(&dst, ir);
1724 }
1725
1726 void
1727 vec4_visitor::visit(ir_call *ir)
1728 {
1729 assert(!"not reached");
1730 }
1731
1732 void
1733 vec4_visitor::visit(ir_texture *ir)
1734 {
1735 /* FINISHME: Implement vertex texturing.
1736 *
1737 * With 0 vertex samplers available, the linker will reject
1738 * programs that do vertex texturing, but after our visitor has
1739 * run.
1740 */
1741 this->result = src_reg(this, glsl_type::vec4_type);
1742 }
1743
1744 void
1745 vec4_visitor::visit(ir_return *ir)
1746 {
1747 assert(!"not reached");
1748 }
1749
1750 void
1751 vec4_visitor::visit(ir_discard *ir)
1752 {
1753 assert(!"not reached");
1754 }
1755
1756 void
1757 vec4_visitor::visit(ir_if *ir)
1758 {
1759 /* Don't point the annotation at the if statement, because then it plus
1760 * the then and else blocks get printed.
1761 */
1762 this->base_ir = ir->condition;
1763
1764 if (intel->gen == 6) {
1765 emit_if_gen6(ir);
1766 } else {
1767 uint32_t predicate;
1768 emit_bool_to_cond_code(ir->condition, &predicate);
1769 emit(IF(predicate));
1770 }
1771
1772 visit_instructions(&ir->then_instructions);
1773
1774 if (!ir->else_instructions.is_empty()) {
1775 this->base_ir = ir->condition;
1776 emit(BRW_OPCODE_ELSE);
1777
1778 visit_instructions(&ir->else_instructions);
1779 }
1780
1781 this->base_ir = ir->condition;
1782 emit(BRW_OPCODE_ENDIF);
1783 }
1784
1785 void
1786 vec4_visitor::emit_ndc_computation()
1787 {
1788 /* Get the position */
1789 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1790
1791 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1792 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1793 output_reg[BRW_VERT_RESULT_NDC] = ndc;
1794
1795 current_annotation = "NDC";
1796 dst_reg ndc_w = ndc;
1797 ndc_w.writemask = WRITEMASK_W;
1798 src_reg pos_w = pos;
1799 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1800 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1801
1802 dst_reg ndc_xyz = ndc;
1803 ndc_xyz.writemask = WRITEMASK_XYZ;
1804
1805 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1806 }
1807
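/**
 * Writes the VUE header dword carrying the point size and clip flags,
 * packing them together on pre-gen6 hardware and falling back to a
 * simple zero-fill (plus point size in the W channel on gen6+) when no
 * packing is required.
 */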
1808 void
1809 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
1810 {
1811 if (intel->gen < 6 &&
1812 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1813 c->key.userclip_active || brw->has_negative_rhw_bug)) {
1814 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1815 dst_reg header1_w = header1;
1816 header1_w.writemask = WRITEMASK_W;
1817 GLuint i;
1818
1819 emit(MOV(header1, 0u));
1820
1821 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1822 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
1823
1824 current_annotation = "Point size";
1825 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1826 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1827 }
1828
1829 current_annotation = "Clipping flags";
1830 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1831 vec4_instruction *inst;
1832
1833 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
1834 src_reg(this->userplane[i])));
1835 inst->conditional_mod = BRW_CONDITIONAL_L;
1836
1837 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
1838 inst->predicate = BRW_PREDICATE_NORMAL;
1839 }
1840
1841 /* i965 clipping workaround:
1842 * 1) Test for -ve rhw
1843 * 2) If set,
1844 * set ndc = (0,0,0,0)
1845 * set ucp[6] = 1
1846 *
1847 * Later, clipping will detect ucp[6] and ensure the primitive is
1848 * clipped against all fixed planes.
1849 */
1850 if (brw->has_negative_rhw_bug) {
1851 #if 0
1852 /* FINISHME */
1853 brw_CMP(p,
1854 vec8(brw_null_reg()),
1855 BRW_CONDITIONAL_L,
1856 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
1857 brw_imm_f(0));
1858
1859 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1860 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
1861 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1862 #endif
1863 }
1864
1865 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1866 } else if (intel->gen < 6) {
1867 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1868 } else {
1869 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1870 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1871 emit(MOV(brw_writemask(reg, WRITEMASK_W),
1872 src_reg(output_reg[VERT_RESULT_PSIZ])));
1873 }
1874 }
1875 }
1876
1877 void
1878 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
1879 {
1880 if (intel->gen < 6) {
1881 /* Clip distance slots are set aside in gen5, but they are not used. It
1882 * is not clear whether we actually need to set aside space for them,
1883 * but the performance cost is negligible.
1884 */
1885 return;
1886 }
1887
1888 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1889 *
1890 * "If a linked set of shaders forming the vertex stage contains no
1891 * static write to gl_ClipVertex or gl_ClipDistance, but the
1892 * application has requested clipping against user clip planes through
1893 * the API, then the coordinate written to gl_Position is used for
1894 * comparison against the user clip planes."
1895 *
1896 * This function is only called if the shader didn't write to
1897 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
1898 * if the user wrote to it; otherwise we use gl_Position.
1899 */
1900 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
1901 if (!(c->prog_data.outputs_written
1902 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
1903 clip_vertex = VERT_RESULT_HPOS;
1904 }
1905
1906 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
1907 ++i) {
1908 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
1909 src_reg(output_reg[clip_vertex]),
1910 src_reg(this->userplane[i + offset])));
1911 }
1912 }
1913
1914 void
1915 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
1916 {
1917 assert (vert_result < VERT_RESULT_MAX);
1918 current_annotation = output_reg_annotation[vert_result];
1919 /* Copy the register, saturating if necessary */
1920 vec4_instruction *inst = emit(MOV(reg,
1921 src_reg(output_reg[vert_result])));
1922 if ((vert_result == VERT_RESULT_COL0 ||
1923 vert_result == VERT_RESULT_COL1 ||
1924 vert_result == VERT_RESULT_BFC0 ||
1925 vert_result == VERT_RESULT_BFC1) &&
1926 c->key.clamp_vertex_color) {
1927 inst->saturate = true;
1928 }
1929 }
1930
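/**
 * Emits the data for a single VUE slot into the given MRF, handling the
 * slots with dedicated layouts (the PSIZ/flags header, NDC, gl_Position,
 * clip distances) specially and falling back to a plain copy for the
 * rest.
 */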
1931 void
1932 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
1933 {
1934 struct brw_reg hw_reg = brw_message_reg(mrf);
1935 dst_reg reg = dst_reg(MRF, mrf);
1936 reg.type = BRW_REGISTER_TYPE_F;
1937
1938 switch (vert_result) {
1939 case VERT_RESULT_PSIZ:
1940 /* PSIZ is always in slot 0, and is coupled with other flags. */
1941 current_annotation = "indices, point width, clip flags";
1942 emit_psiz_and_flags(hw_reg);
1943 break;
1944 case BRW_VERT_RESULT_NDC:
1945 current_annotation = "NDC";
1946 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
1947 break;
1948 case BRW_VERT_RESULT_HPOS_DUPLICATE:
1949 case VERT_RESULT_HPOS:
1950 current_annotation = "gl_Position";
1951 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
1952 break;
1953 case VERT_RESULT_CLIP_DIST0:
1954 case VERT_RESULT_CLIP_DIST1:
1955 if (this->c->key.uses_clip_distance) {
1956 emit_generic_urb_slot(reg, vert_result);
1957 } else {
1958 current_annotation = "user clip distances";
1959 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
1960 }
1961 break;
1962 case BRW_VERT_RESULT_PAD:
1963 /* No need to write to this slot */
1964 break;
1965 default:
1966 emit_generic_urb_slot(reg, vert_result);
1967 break;
1968 }
1969 }
1970
1971 static int
1972 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1973 {
1974 struct intel_context *intel = &brw->intel;
1975
1976 if (intel->gen >= 6) {
1977 /* URB data written (does not include the message header reg) must
1978 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1979 * section 5.4.3.2.2: URB_INTERLEAVED.
1980 *
1981 * URB entries are allocated on a multiple of 1024 bits, so an
1982 * extra 128 bits written here to make the end align to 256 is
1983 * no problem.
1984 */
1985 if ((mlen % 2) != 1)
1986 mlen++;
1987 }
1988
1989 return mlen;
1990 }
1991
1992 /**
1993 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1994 * complete the VS thread.
1995 *
1996 * The VUE layout is documented in Volume 2a.
1997 */
1998 void
1999 vec4_visitor::emit_urb_writes()
2000 {
2001 /* MRF 0 is reserved for the debugger, so start with message header
2002 * in MRF 1.
2003 */
2004 int base_mrf = 1;
2005 int mrf = base_mrf;
2006 /* In the process of generating our URB write message contents, we
2007 * may need to unspill a register or load from an array. Those
2008 * reads would use MRFs 14-15.
2009 */
2010 int max_usable_mrf = 13;
2011
2012 /* The following assertion verifies that filling the VUE up to
2013 * max_usable_mrf produces an even number of URB write data registers,
2014 * which meets gen6's requirement for message length alignment.
2015 */
2016 assert((max_usable_mrf - base_mrf) % 2 == 0);
2017
2018 /* FINISHME: edgeflag */
2019
2020 brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
2021 c->prog_data.outputs_written);
2022
2023 /* First mrf is the g0-based message header containing URB handles and such,
2024 * which is implied in VS_OPCODE_URB_WRITE.
2025 */
2026 mrf++;
2027
2028 if (intel->gen < 6) {
2029 emit_ndc_computation();
2030 }
2031
2032 /* Set up the VUE data for the first URB write */
2033 int slot;
2034 for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
2035 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2036
2037 /* If this was max_usable_mrf, we can't fit anything more into this URB
2038 * WRITE.
2039 */
2040 if (mrf > max_usable_mrf) {
2041 slot++;
2042 break;
2043 }
2044 }
2045
2046 current_annotation = "URB write";
2047 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2048 inst->base_mrf = base_mrf;
2049 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2050 inst->eot = (slot >= c->vue_map.num_slots);
2051
2052 /* Optional second URB write */
2053 if (!inst->eot) {
2054 mrf = base_mrf + 1;
2055
2056 for (; slot < c->vue_map.num_slots; ++slot) {
2057 assert(mrf < max_usable_mrf);
2058
2059 emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
2060 }
2061
2062 current_annotation = "URB write";
2063 inst = emit(VS_OPCODE_URB_WRITE);
2064 inst->base_mrf = base_mrf;
2065 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2066 inst->eot = true;
2067 /* URB destination offset.  The previous write used MRFs 1-13; minus
2068 * the one header MRF, that's 12 data regs.  URB offset is in URB row
2069 * increments, and each of our MRFs is half of one of those, since
2070 * we're doing interleaved writes.
2071 */
2072 inst->offset = (max_usable_mrf - base_mrf) / 2;
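/* With base_mrf == 1 and max_usable_mrf == 13, that's (13 - 1) / 2 = 6
* URB rows already written by the first message.
*/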
2073 }
2074
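/* Record the VUE size, rounded up to whole allocation blocks: 8 slots
* per block on gen6, 4 per block otherwise.
*/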
2075 if (intel->gen == 6)
2076 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 8) / 8;
2077 else
2078 c->prog_data.urb_entry_size = ALIGN(c->vue_map.num_slots, 4) / 4;
2079 }
2080
2081 src_reg
2082 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2083 src_reg *reladdr, int reg_offset)
2084 {
2085 /* Because we store the values to scratch interleaved like our
2086 * vertex data, we need to scale the vec4 index by 2.
2087 */
2088 int message_header_scale = 2;
2089
2090 /* Pre-gen6, the message header uses byte offsets instead of vec4
2091 * (16-byte) offset units.
2092 */
2093 if (intel->gen < 6)
2094 message_header_scale *= 16;
2095
2096 if (reladdr) {
2097 src_reg index = src_reg(this, glsl_type::int_type);
2098
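/* index = (reladdr + reg_offset) * message_header_scale */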
2099 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2100 emit_before(inst, MUL(dst_reg(index),
2101 index, src_reg(message_header_scale)));
2102
2103 return index;
2104 } else {
2105 return src_reg(reg_offset * message_header_scale);
2106 }
2107 }
2108
2109 src_reg
2110 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2111 src_reg *reladdr, int reg_offset)
2112 {
2113 if (reladdr) {
2114 src_reg index = src_reg(this, glsl_type::int_type);
2115
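/* index = reladdr + reg_offset */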
2116 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2117
2118 /* Pre-gen6, the message header uses byte offsets instead of vec4
2119 * (16-byte) offset units.
2120 */
2121 if (intel->gen < 6) {
2122 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2123 }
2124
2125 return index;
2126 } else {
2127 int message_header_scale = intel->gen < 6 ? 16 : 1;
2128 return src_reg(reg_offset * message_header_scale);
2129 }
2130 }
2131
2132 /**
2133 * Emits an instruction before @inst to load the value named by @orig_src
2134 * from scratch space at @base_offset to @temp.
2135 */
2136 void
2137 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2138 dst_reg temp, src_reg orig_src,
2139 int base_offset)
2140 {
2141 int reg_offset = base_offset + orig_src.reg_offset;
2142 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2143
2144 emit_before(inst, SCRATCH_READ(temp, index));
2145 }
2146
2147 /**
2148 * Emits an instruction after @inst to store @temp (the value that was
2149 * to be written to @orig_dst) into scratch space at @base_offset.
2150 */
2151 void
2152 vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2153 src_reg temp, dst_reg orig_dst,
2154 int base_offset)
2155 {
2156 int reg_offset = base_offset + orig_dst.reg_offset;
2157 src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2158
2159 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2160 orig_dst.writemask));
2161 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2162 write->predicate = inst->predicate;
2163 write->ir = inst->ir;
2164 write->annotation = inst->annotation;
2165 inst->insert_after(write);
2166 }
2167
2168 /**
2169 * We can't generally support array access in GRF space, because a
2170 * single instruction's destination can only span 2 contiguous
2171 * registers. So, we send all GRF arrays that get variable index
2172 * access to scratch space.
2173 */
2174 void
2175 vec4_visitor::move_grf_array_access_to_scratch()
2176 {
2177 int scratch_loc[this->virtual_grf_count];
2178
2179 for (int i = 0; i < this->virtual_grf_count; i++) {
2180 scratch_loc[i] = -1;
2181 }
2182
2183 /* First, calculate the set of virtual GRFs that need to be punted
2184 * to scratch due to having any array access on them, and where in
2185 * scratch.
2186 */
2187 foreach_list(node, &this->instructions) {
2188 vec4_instruction *inst = (vec4_instruction *)node;
2189
2190 if (inst->dst.file == GRF && inst->dst.reladdr &&
2191 scratch_loc[inst->dst.reg] == -1) {
2192 scratch_loc[inst->dst.reg] = c->last_scratch;
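/* Each vec4 slot occupies 8 floats * 4 bytes = 32 bytes of scratch,
* since values are stored interleaved like vertex data (see
* get_scratch_offset()).
*/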
2193 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2194 }
2195
2196 for (int i = 0; i < 3; i++) {
2197 src_reg *src = &inst->src[i];
2198
2199 if (src->file == GRF && src->reladdr &&
2200 scratch_loc[src->reg] == -1) {
2201 scratch_loc[src->reg] = c->last_scratch;
2202 c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2203 }
2204 }
2205 }
2206
2207 /* Now, for anything that will be accessed through scratch, rewrite
2208 * it to load/store. Note that this is a _safe list walk, because
2209 * we may generate a new scratch_write instruction after the one
2210 * we're processing.
2211 */
2212 foreach_list_safe(node, &this->instructions) {
2213 vec4_instruction *inst = (vec4_instruction *)node;
2214
2215 /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2220 src_reg temp = src_reg(this, glsl_type::vec4_type);
2221
2222 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2223
2224 inst->dst.file = temp.file;
2225 inst->dst.reg = temp.reg;
2226 inst->dst.reg_offset = temp.reg_offset;
2227 inst->dst.reladdr = NULL;
2228 }
2229
2230 for (int i = 0; i < 3; i++) {
2231 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2232 continue;
2233
2234 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2235
2236 emit_scratch_read(inst, temp, inst->src[i],
2237 scratch_loc[inst->src[i].reg]);
2238
2239 inst->src[i].file = temp.file;
2240 inst->src[i].reg = temp.reg;
2241 inst->src[i].reg_offset = temp.reg_offset;
2242 inst->src[i].reladdr = NULL;
2243 }
2244 }
2245 }
2246
2247 /**
2248 * Emits an instruction before @inst to load the value named by @orig_src
2249 * from the pull constant buffer (surface) at @base_offset to @temp.
2250 */
2251 void
2252 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2253 dst_reg temp, src_reg orig_src,
2254 int base_offset)
2255 {
2256 int reg_offset = base_offset + orig_src.reg_offset;
2257 src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2258 vec4_instruction *load;
2259
2260 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2261 temp, index);
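/* Use MRF 14, one of the MRFs that emit_urb_writes() leaves free for
* scratch and pull constant reads.
*/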
2262 load->base_mrf = 14;
2263 load->mlen = 1;
2264 emit_before(inst, load);
2265 }
2266
2267 /**
2268 * Implements array access of uniforms by inserting a
2269 * PULL_CONSTANT_LOAD instruction.
2270 *
2271 * Unlike temporary GRF array access (where we don't support it due to
2272 * the difficulty of doing relative addressing on instruction
2273 * destinations), we could potentially do array access of uniforms
2274 * that were loaded in GRF space as push constants. In real-world
2275 * usage we've seen, though, the arrays being used are always larger
2276 * than we could load as push constants, so just always move all
2277 * uniform array access out to a pull constant buffer.
2278 */
2279 void
2280 vec4_visitor::move_uniform_array_access_to_pull_constants()
2281 {
2282 int pull_constant_loc[this->uniforms];
2283
2284 for (int i = 0; i < this->uniforms; i++) {
2285 pull_constant_loc[i] = -1;
2286 }
2287
2288 /* Walk through and find array access of uniforms. Put a copy of that
2289 * uniform in the pull constant buffer.
2290 *
2291 * Note that we don't move constant-indexed accesses to arrays. No
2292 * testing has been done of the performance impact of this choice.
2293 */
2294 foreach_list_safe(node, &this->instructions) {
2295 vec4_instruction *inst = (vec4_instruction *)node;
2296
2297 for (int i = 0; i < 3; i++) {
2298 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2299 continue;
2300
2301 int uniform = inst->src[i].reg;
2302
2303 /* If this array isn't already present in the pull constant buffer,
2304 * add it.
2305 */
2306 if (pull_constant_loc[uniform] == -1) {
2307 const float **values = &prog_data->param[uniform * 4];
2308
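/* nr_pull_params counts floats, but pull constant locations are in
* vec4 units, hence the divide by 4.
*/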
2309 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2310
2311 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2312 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2313 }
2314 }
2315
2316 /* Set up the annotation tracking for newly generated instructions. */
2317 base_ir = inst->ir;
2318 current_annotation = inst->annotation;
2319
2320 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2321
2322 emit_pull_constant_load(inst, temp, inst->src[i],
2323 pull_constant_loc[uniform]);
2324
2325 inst->src[i].file = temp.file;
2326 inst->src[i].reg = temp.reg;
2327 inst->src[i].reg_offset = temp.reg_offset;
2328 inst->src[i].reladdr = NULL;
2329 }
2330 }
2331
2332 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2333 * no need to track them as larger-than-vec4 objects. This will be
2334 * relied on in cutting out unused uniform vectors from push
2335 * constants.
2336 */
2337 split_uniform_registers();
2338 }
2339
2340 void
2341 vec4_visitor::resolve_ud_negate(src_reg *reg)
2342 {
2343 if (reg->type != BRW_REGISTER_TYPE_UD ||
2344 !reg->negate)
2345 return;
2346
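/* Copy the negated value into a temporary and use that instead, so no
* later instruction is left with source negation on an unsigned-dword
* operand.
*/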
2347 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2348 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2349 *reg = temp;
2350 }
2351
2352 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2353 struct gl_shader_program *prog,
2354 struct brw_shader *shader)
2355 {
2356 this->c = c;
2357 this->p = &c->func;
2358 this->brw = p->brw;
2359 this->intel = &brw->intel;
2360 this->ctx = &intel->ctx;
2361 this->prog = prog;
2362 this->shader = shader;
2363
2364 this->mem_ctx = ralloc_context(NULL);
2365 this->failed = false;
2366
2367 this->base_ir = NULL;
2368 this->current_annotation = NULL;
2369
2371 this->vp = (struct gl_vertex_program *)
2372 prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
2373 this->prog_data = &c->prog_data;
2374
2375 this->variable_ht = hash_table_ctor(0,
2376 hash_table_pointer_hash,
2377 hash_table_pointer_compare);
2378
2379 this->virtual_grf_def = NULL;
2380 this->virtual_grf_use = NULL;
2381 this->virtual_grf_sizes = NULL;
2382 this->virtual_grf_count = 0;
2383 this->virtual_grf_reg_map = NULL;
2384 this->virtual_grf_reg_count = 0;
2385 this->virtual_grf_array_size = 0;
2386 this->live_intervals_valid = false;
2387
2388 this->uniforms = 0;
2393 }
2394
2395 vec4_visitor::~vec4_visitor()
2396 {
2397 ralloc_free(this->mem_ctx);
2398 hash_table_dtor(this->variable_ht);
2399 }
2400
2401
2402 void
2403 vec4_visitor::fail(const char *format, ...)
2404 {
2405 va_list va;
2406 char *msg;
2407
2408 if (failed)
2409 return;
2410
2411 failed = true;
2412
2413 va_start(va, format);
2414 msg = ralloc_vasprintf(mem_ctx, format, va);
2415 va_end(va);
2416 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2417
2418 this->fail_msg = msg;
2419
2420 if (INTEL_DEBUG & DEBUG_VS) {
2421 fprintf(stderr, "%s", msg);
2422 }
2423 }
2424
2425 } /* namespace brw */