src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "glsl/ir_uniform.h"
  26 extern "C" {
  27 #include "main/context.h"
  28 #include "main/macros.h"
  29 #include "program/prog_parameter.h"
  30 #include "program/sampler.h"
  31 }
  32
  33 namespace brw {
  34
  35 vec4_instruction::vec4_instruction(vec4_visitor *v,
  36                                    enum opcode opcode, dst_reg dst,
  37                                    src_reg src0, src_reg src1, src_reg src2)
  38 {
  39    this->opcode = opcode;
  40    this->dst = dst;
  41    this->src[0] = src0;
  42    this->src[1] = src1;
  43    this->src[2] = src2;
  44    this->ir = v->base_ir;
  45    this->annotation = v->current_annotation;
  46 }
  47
  48 vec4_instruction *
  49 vec4_visitor::emit(vec4_instruction *inst)
  50 {
  51    this->instructions.push_tail(inst);
  52
  53    return inst;
  54 }
  55
  56 vec4_instruction *
  57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  58 {
  59    new_inst->ir = inst->ir;
  60    new_inst->annotation = inst->annotation;
  61
  62    inst->insert_before(new_inst);
  63
  64    return inst;
  65 }
  66
  67 vec4_instruction *
  68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  69                    src_reg src0, src_reg src1, src_reg src2)
  70 {
  71    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  72                                              src0, src1, src2));
  73 }
  74
  75
  76 vec4_instruction *
  77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  78 {
  79    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  80 }
  81
  82 vec4_instruction *
  83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  84 {
  85    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  86 }
  87
  88 vec4_instruction *
  89 vec4_visitor::emit(enum opcode opcode)
  90 {
  91    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  92 }
  93
  94 #define ALU1(op)                                                        \
  95    vec4_instruction *                                                   \
  96    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  97    {                                                                    \
  98       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  99                                            src0);                       \
 100    }
 101
 102 #define ALU2(op)                                                        \
 103    vec4_instruction *                                                   \
 104    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 105    {                                                                    \
 106       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 107                                            src0, src1);                 \
 108    }
 109
 110 ALU1(NOT)
 111 ALU1(MOV)
 112 ALU1(FRC)
 113 ALU1(RNDD)
 114 ALU1(RNDE)
 115 ALU1(RNDZ)
 116 ALU2(ADD)
 117 ALU2(MUL)
 118 ALU2(MACH)
 119 ALU2(AND)
 120 ALU2(OR)
 121 ALU2(XOR)
 122 ALU2(DP3)
 123 ALU2(DP4)
 124 ALU2(DPH)
 125 ALU2(SHL)
 126 ALU2(SHR)
 127 ALU2(ASR)
 128
 129 /** Gen4 predicated IF. */
 130 vec4_instruction *
 131 vec4_visitor::IF(uint32_t predicate)
 132 {
 133    vec4_instruction *inst;
 134
 135    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 136    inst->predicate = predicate;
 137
 138    return inst;
 139 }
 140
 141 /** Gen6+ IF with embedded comparison. */
 142 vec4_instruction *
 143 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 144 {
 145    assert(intel->gen >= 6);
 146
 147    vec4_instruction *inst;
 148
 149    resolve_ud_negate(&src0);
 150    resolve_ud_negate(&src1);
 151
 152    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 153                                         src0, src1);
 154    inst->conditional_mod = condition;
 155
 156    return inst;
 157 }
 158
 159 /**
 160  * CMP: Sets the low bit of the destination channels with the result
 161  * of the comparison, while the upper bits are undefined, and updates
 162  * the flag register with the packed 16 bits of the result.
 163  */
 164 vec4_instruction *
 165 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 166 {
 167    vec4_instruction *inst;
 168
 169    /* original gen4 does type conversion to the destination type
 170     * before before comparison, producing garbage results for floating
 171     * point comparisons.
 172     */
 173    if (intel->gen == 4) {
 174       dst.type = src0.type;
 175       if (dst.file == HW_REG)
 176          dst.fixed_hw_reg.type = dst.type;
 177    }
 178
 179    resolve_ud_negate(&src0);
 180    resolve_ud_negate(&src1);
 181
 182    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 183    inst->conditional_mod = condition;
 184
 185    return inst;
 186 }
 187
 188 vec4_instruction *
 189 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 190 {
 191    vec4_instruction *inst;
 192
 193    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 194                                         dst, index);
 195    inst->base_mrf = 14;
 196    inst->mlen = 2;
 197
 198    return inst;
 199 }
 200
 201 vec4_instruction *
 202 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 203 {
 204    vec4_instruction *inst;
 205
 206    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 207                                         dst, src, index);
 208    inst->base_mrf = 13;
 209    inst->mlen = 3;
 210
 211    return inst;
 212 }
 213
 214 void
 215 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 216 {
 217    static enum opcode dot_opcodes[] = {
 218       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 219    };
 220
 221    emit(dot_opcodes[elements - 2], dst, src0, src1);
 222 }
 223
 224 src_reg
 225 vec4_visitor::fix_math_operand(src_reg src)
 226 {
 227    /* The gen6 math instruction ignores the source modifiers --
 228     * swizzle, abs, negate, and at least some parts of the register
 229     * region description.
 230     *
 231     * Rather than trying to enumerate all these cases, *always* expand the
 232     * operand to a temp GRF for gen6.
 233     *
 234     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 235     * can't use.
 236     */
 237
 238    if (intel->gen == 7 && src.file != IMM)
 239       return src;
 240
 241    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 242    expanded.type = src.type;
 243    emit(MOV(expanded, src));
 244    return src_reg(expanded);
 245 }
 246
 247 void
 248 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 249 {
 250    src = fix_math_operand(src);
 251
 252    if (dst.writemask != WRITEMASK_XYZW) {
 253       /* The gen6 math instruction must be align1, so we can't do
 254        * writemasks.
 255        */
 256       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 257
 258       emit(opcode, temp_dst, src);
 259
 260       emit(MOV(dst, src_reg(temp_dst)));
 261    } else {
 262       emit(opcode, dst, src);
 263    }
 264 }
 265
 266 void
 267 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 268 {
 269    vec4_instruction *inst = emit(opcode, dst, src);
 270    inst->base_mrf = 1;
 271    inst->mlen = 1;
 272 }
 273
 274 void
 275 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 276 {
 277    switch (opcode) {
 278    case SHADER_OPCODE_RCP:
 279    case SHADER_OPCODE_RSQ:
 280    case SHADER_OPCODE_SQRT:
 281    case SHADER_OPCODE_EXP2:
 282    case SHADER_OPCODE_LOG2:
 283    case SHADER_OPCODE_SIN:
 284    case SHADER_OPCODE_COS:
 285       break;
 286    default:
 287       assert(!"not reached: bad math opcode");
 288       return;
 289    }
 290
 291    if (intel->gen >= 6) {
 292       return emit_math1_gen6(opcode, dst, src);
 293    } else {
 294       return emit_math1_gen4(opcode, dst, src);
 295    }
 296 }
 297
 298 void
 299 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 300                               dst_reg dst, src_reg src0, src_reg src1)
 301 {
 302    src0 = fix_math_operand(src0);
 303    src1 = fix_math_operand(src1);
 304
 305    if (dst.writemask != WRITEMASK_XYZW) {
 306       /* The gen6 math instruction must be align1, so we can't do
 307        * writemasks.
 308        */
 309       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 310       temp_dst.type = dst.type;
 311
 312       emit(opcode, temp_dst, src0, src1);
 313
 314       emit(MOV(dst, src_reg(temp_dst)));
 315    } else {
 316       emit(opcode, dst, src0, src1);
 317    }
 318 }
 319
 320 void
 321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 322                               dst_reg dst, src_reg src0, src_reg src1)
 323 {
 324    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 325    inst->base_mrf = 1;
 326    inst->mlen = 2;
 327 }
 328
 329 void
 330 vec4_visitor::emit_math(enum opcode opcode,
 331                         dst_reg dst, src_reg src0, src_reg src1)
 332 {
 333    switch (opcode) {
 334    case SHADER_OPCODE_POW:
 335    case SHADER_OPCODE_INT_QUOTIENT:
 336    case SHADER_OPCODE_INT_REMAINDER:
 337       break;
 338    default:
 339       assert(!"not reached: unsupported binary math opcode");
 340       return;
 341    }
 342
 343    if (intel->gen >= 6) {
 344       return emit_math2_gen6(opcode, dst, src0, src1);
 345    } else {
 346       return emit_math2_gen4(opcode, dst, src0, src1);
 347    }
 348 }
 349
 350 void
 351 vec4_visitor::visit_instructions(const exec_list *list)
 352 {
 353    foreach_list(node, list) {
 354       ir_instruction *ir = (ir_instruction *)node;
 355
 356       base_ir = ir;
 357       ir->accept(this);
 358    }
 359 }
 360
 361
 362 static int
 363 type_size(const struct glsl_type *type)
 364 {
 365    unsigned int i;
 366    int size;
 367
 368    switch (type->base_type) {
 369    case GLSL_TYPE_UINT:
 370    case GLSL_TYPE_INT:
 371    case GLSL_TYPE_FLOAT:
 372    case GLSL_TYPE_BOOL:
 373       if (type->is_matrix()) {
 374          return type->matrix_columns;
 375       } else {
 376          /* Regardless of size of vector, it gets a vec4. This is bad
 377           * packing for things like floats, but otherwise arrays become a
 378           * mess.  Hopefully a later pass over the code can pack scalars
 379           * down if appropriate.
 380           */
 381          return 1;
 382       }
 383    case GLSL_TYPE_ARRAY:
 384       assert(type->length > 0);
 385       return type_size(type->fields.array) * type->length;
 386    case GLSL_TYPE_STRUCT:
 387       size = 0;
 388       for (i = 0; i < type->length; i++) {
 389          size += type_size(type->fields.structure[i].type);
 390       }
 391       return size;
 392    case GLSL_TYPE_SAMPLER:
 393       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 394        * at link time.
 395        */
 396       return 1;
 397    default:
 398       assert(0);
 399       return 0;
 400    }
 401 }
 402
 403 int
 404 vec4_visitor::virtual_grf_alloc(int size)
 405 {
 406    if (virtual_grf_array_size <= virtual_grf_count) {
 407       if (virtual_grf_array_size == 0)
 408          virtual_grf_array_size = 16;
 409       else
 410          virtual_grf_array_size *= 2;
 411       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 412                                    virtual_grf_array_size);
 413       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 414                                      virtual_grf_array_size);
 415    }
 416    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 417    virtual_grf_reg_count += size;
 418    virtual_grf_sizes[virtual_grf_count] = size;
 419    return virtual_grf_count++;
 420 }
 421
 422 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 423 {
 424    init();
 425
 426    this->file = GRF;
 427    this->reg = v->virtual_grf_alloc(type_size(type));
 428
 429    if (type->is_array() || type->is_record()) {
 430       this->swizzle = BRW_SWIZZLE_NOOP;
 431    } else {
 432       this->swizzle = swizzle_for_size(type->vector_elements);
 433    }
 434
 435    this->type = brw_type_for_base_type(type);
 436 }
 437
 438 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 439 {
 440    init();
 441
 442    this->file = GRF;
 443    this->reg = v->virtual_grf_alloc(type_size(type));
 444
 445    if (type->is_array() || type->is_record()) {
 446       this->writemask = WRITEMASK_XYZW;
 447    } else {
 448       this->writemask = (1 << type->vector_elements) - 1;
 449    }
 450
 451    this->type = brw_type_for_base_type(type);
 452 }
 453
 454 /* Our support for uniforms is piggy-backed on the struct
 455  * gl_fragment_program, because that's where the values actually
 456  * get stored, rather than in some global gl_shader_program uniform
 457  * store.
 458  */
 459 void
 460 vec4_visitor::setup_uniform_values(ir_variable *ir)
 461 {
 462    int namelen = strlen(ir->name);
 463
 464    /* The data for our (non-builtin) uniforms is stored in a series of
 465     * gl_uniform_driver_storage structs for each subcomponent that
 466     * glGetUniformLocation() could name.  We know it's been set up in the same
 467     * order we'd walk the type, so walk the list of storage and find anything
 468     * with our name, or the prefix of a component that starts with our name.
 469     */
 470    for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
 471       struct gl_uniform_storage *storage = &prog->UniformStorage[u];
 472
 473       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 474           (storage->name[namelen] != 0 &&
 475            storage->name[namelen] != '.' &&
 476            storage->name[namelen] != '[')) {
 477          continue;
 478       }
 479
 480       gl_constant_value *components = storage->storage;
 481       unsigned vector_count = (MAX2(storage->array_elements, 1) *
 482                                storage->type->matrix_columns);
 483
 484       for (unsigned s = 0; s < vector_count; s++) {
 485          uniform_vector_size[uniforms] = storage->type->vector_elements;
 486
 487          int i;
 488          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
 489             c->prog_data.param[uniforms * 4 + i] = &components->f;
 490             components++;
 491          }
 492          for (; i < 4; i++) {
 493             static float zero = 0;
 494             c->prog_data.param[uniforms * 4 + i] = &zero;
 495          }
 496
 497          uniforms++;
 498       }
 499    }
 500 }
 501
 502 void
 503 vec4_visitor::setup_uniform_clipplane_values()
 504 {
 505    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 506
 507    if (intel->gen < 6) {
 508       /* Pre-Gen6, we compact clip planes.  For example, if the user
 509        * enables just clip planes 0, 1, and 3, we will enable clip planes
 510        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 511        * plane 2.  This simplifies the implementation of the Gen6 clip
 512        * thread.
 513        */
 514       int compacted_clipplane_index = 0;
 515       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
 516          if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
 517             continue;
 518
 519          this->uniform_vector_size[this->uniforms] = 4;
 520          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 521          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 522          for (int j = 0; j < 4; ++j) {
 523             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 524          }
 525          ++compacted_clipplane_index;
 526          ++this->uniforms;
 527       }
 528    } else {
 529       /* In Gen6 and later, we don't compact clip planes, because this
 530        * simplifies the implementation of gl_ClipDistance.
 531        */
 532       for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 533          this->uniform_vector_size[this->uniforms] = 4;
 534          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 535          this->userplane[i].type = BRW_REGISTER_TYPE_F;
 536          for (int j = 0; j < 4; ++j) {
 537             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 538          }
 539          ++this->uniforms;
 540       }
 541    }
 542 }
 543
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 *
 * One uniform vec4 slot is consumed per state slot of the variable: the
 * slot's four parameter pointers are aimed at the swizzled channels of the
 * corresponding entry in the vertex program's ParameterValues.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);
      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;

      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       *
       * NOTE(review): last_swiz is assigned from swiz *before* the
       * comparison below, so "swiz <= last_swiz" is always true and
       * uniform_vector_size ends up as 4 for every slot, regardless of the
       * swizzle.  That does not match the "unique channels" description
       * above -- confirm the intended condition (and what consumers of
       * uniform_vector_size expect) before changing it.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         last_swiz = swiz;

         /* Channel j of this uniform slot reads the swizzled source
          * channel of the state parameter.
          */
         c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}
 580
 581 dst_reg *
 582 vec4_visitor::variable_storage(ir_variable *var)
 583 {
 584    return (dst_reg *)hash_table_find(this->variable_ht, var);
 585 }
 586
/**
 * Emit code that evaluates the boolean rvalue ir into a condition, using an
 * instruction with a null destination and a conditional modifier.
 *
 * \param ir         boolean rvalue to evaluate.
 * \param predicate  out: predication mode chosen for the emitted condition.
 *                   It is BRW_PREDICATE_NORMAL except for the
 *                   all_equal / any_nequal / any operations, which select
 *                   the ALIGN16 ALL4H / ANY4H modes.
 *
 * When ir is an expression, its operands are visited and the comparison is
 * built directly from them; otherwise ir itself is visited and its result
 * is tested.
 */
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      /* Evaluate the operands; each visit leaves its value in this->result. */
      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         /* Test bit 0 of the operand; the condition holds when it is zero. */
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         /* Gen6+ compares against 0.0f; earlier gens use a MOV with a
          * conditional modifier instead.
          */
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         /* Same split as f2b, with an integer zero. */
         if (intel->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         /* Per-channel equality, combined by the ALL4H predicate. */
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         /* Per-channel inequality, combined by the ANY4H predicate. */
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         /* Per-channel non-zero test, combined by the ANY4H predicate. */
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         /* Plain comparisons map directly onto a CMP condition. */
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         assert(!"not reached");
         break;
      }
      return;
   }

   /* Not an expression: evaluate the rvalue and test the resulting value. */
   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (intel->gen >= 6) {
      /* Gen6+ tests only bit 0 of the result. */
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}
 690
 691 /**
 692  * Emit a gen6 IF statement with the comparison folded into the IF
 693  * instruction.
 694  */
 695 void
 696 vec4_visitor::emit_if_gen6(ir_if *ir)
 697 {
 698    ir_expression *expr = ir->condition->as_expression();
 699
 700    if (expr) {
 701       src_reg op[2];
 702       dst_reg temp;
 703
 704       assert(expr->get_num_operands() <= 2);
 705       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 706          expr->operands[i]->accept(this);
 707          op[i] = this->result;
 708       }
 709
 710       switch (expr->operation) {
 711       case ir_unop_logic_not:
 712          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 713          return;
 714
 715       case ir_binop_logic_xor:
 716          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 717          return;
 718
 719       case ir_binop_logic_or:
 720          temp = dst_reg(this, glsl_type::bool_type);
 721          emit(OR(temp, op[0], op[1]));
 722          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 723          return;
 724
 725       case ir_binop_logic_and:
 726          temp = dst_reg(this, glsl_type::bool_type);
 727          emit(AND(temp, op[0], op[1]));
 728          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 729          return;
 730
 731       case ir_unop_f2b:
 732          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 733          return;
 734
 735       case ir_unop_i2b:
 736          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 737          return;
 738
 739       case ir_binop_greater:
 740       case ir_binop_gequal:
 741       case ir_binop_less:
 742       case ir_binop_lequal:
 743       case ir_binop_equal:
 744       case ir_binop_nequal:
 745          emit(IF(op[0], op[1],
 746                  brw_conditional_for_comparison(expr->operation)));
 747          return;
 748
 749       case ir_binop_all_equal:
 750          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 751          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 752          return;
 753
 754       case ir_binop_any_nequal:
 755          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 756          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 757          return;
 758
 759       case ir_unop_any:
 760          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 761          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 762          return;
 763
 764       default:
 765          assert(!"not reached");
 766          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 767          return;
 768       }
 769       return;
 770    }
 771
 772    ir->condition->accept(this);
 773
 774    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 775 }
 776
 777 static dst_reg
 778 with_writemask(dst_reg const & r, int mask)
 779 {
 780    dst_reg result = r;
 781    result.writemask = mask;
 782    return result;
 783 }
 784
/**
 * Emit the per-attribute workaround instructions requested by
 * c->key.gl_attrib_wa_flags for every vertex attribute flagged in
 * prog_data->inputs_read.
 *
 * The fixups rewrite the attribute register in place and are applied in
 * this order: GL_FIXED rescale, sign recovery, BGRA swizzle, normalization,
 * and finally integer-to-float scaling.
 *
 * The constant vectors (shift amounts and normalization factors) are shared
 * between attributes: each is allocated and initialized the first time it
 * is needed, which is detected by its file still being BAD_FILE.
 */
void
vec4_visitor::emit_attribute_fixups()
{
   dst_reg sign_recovery_shift;
   dst_reg normalize_factor;
   dst_reg es3_normalize_factor;

   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
         uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
         /* Views of the same attribute register with signed and unsigned
          * integer types.
          */
         dst_reg reg(ATTR, i);
         dst_reg reg_d = reg;
         reg_d.type = BRW_REGISTER_TYPE_D;
         dst_reg reg_ud = reg;
         reg_ud.type = BRW_REGISTER_TYPE_UD;

         /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
          * come in as floating point conversions of the integer values.
          */
         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
            dst_reg dst = reg;
            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
            /* The masked flag bits give the number of components to scale. */
            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
            emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
         }

         /* Do sign recovery for 2101010 formats if required. */
         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
            if (sign_recovery_shift.file == BAD_FILE) {
               /* shift constant: <22,22,22,30> */
               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
               emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
               emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
            }

            /* Shift left, then arithmetic-shift right by the same amount. */
            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
         }

         /* Apply BGRA swizzle if required. */
         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
            src_reg temp = src_reg(reg);
            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
            emit(MOV(reg, temp));
         }

         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
            /* ES 3.0 has different rules for converting signed normalized
             * fixed-point numbers than desktop GL.
             */
            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
               /* According to equation 2.2 of the ES 3.0 specification,
                * signed normalization conversion is done by:
                *
                * f = c / (2^(b-1)-1)
                */
               if (es3_normalize_factor.file == BAD_FILE) {
                  /* mul constant: 1 / (2^(b-1) - 1) */
                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
                  emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
                           src_reg(1.0f / ((1<<9) - 1))));
                  emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
                           src_reg(1.0f / ((1<<1) - 1))));
               }

               /* Convert the signed integer to float, scale it, then take
                * the result of emit_minmax() against -1.0.
                */
               dst_reg dst = reg;
               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
               emit(MOV(dst, src_reg(reg_d)));
               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
               emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
            } else {
               /* The following equations are from the OpenGL 3.2 specification:
                *
                * 2.1 unsigned normalization
                * f = c/(2^n-1)
                *
                * 2.2 signed normalization
                * f = (2c+1)/(2^n-1)
                *
                * Both of these share a common divisor, which is represented by
                * "normalize_factor" in the code below.
                */
               if (normalize_factor.file == BAD_FILE) {
                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
                  emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
                           src_reg(1.0f / ((1<<10) - 1))));
                  emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
                           src_reg(1.0f / ((1<<2) - 1))));
               }

               /* Integer-to-float conversion, reading the attribute as
                * signed or unsigned depending on the SIGN flag.
                */
               dst_reg dst = reg;
               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));

               /* For signed normalization, we want the numerator to be 2c+1. */
               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
                  emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
                  emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
               }

               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
            }
         }

         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
            /* Plain integer-to-float conversion with no normalization. */
            dst_reg dst = reg;
            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
         }
      }
   }
}
 898
 899 void
 900 vec4_visitor::visit(ir_variable *ir)
 901 {
 902    dst_reg *reg = NULL;
 903
 904    if (variable_storage(ir))
 905       return;
 906
 907    switch (ir->mode) {
 908    case ir_var_in:
 909       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 910       break;
 911
 912    case ir_var_out:
 913       reg = new(mem_ctx) dst_reg(this, ir->type);
 914
 915       for (int i = 0; i < type_size(ir->type); i++) {
 916          output_reg[ir->location + i] = *reg;
 917          output_reg[ir->location + i].reg_offset = i;
 918          output_reg[ir->location + i].type =
 919             brw_type_for_base_type(ir->type->get_scalar_type());
 920          output_reg_annotation[ir->location + i] = ir->name;
 921       }
 922       break;
 923
 924    case ir_var_auto:
 925    case ir_var_temporary:
 926       reg = new(mem_ctx) dst_reg(this, ir->type);
 927       break;
 928
 929    case ir_var_uniform:
 930       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 931
 932       /* Thanks to the lower_ubo_reference pass, we will see only
 933        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 934        * variables, so no need for them to be in variable_ht.
 935        */
 936       if (ir->uniform_block != -1)
 937          return;
 938
 939       /* Track how big the whole uniform variable is, in case we need to put a
 940        * copy of its data into pull constants for array access.
 941        */
 942       this->uniform_size[this->uniforms] = type_size(ir->type);
 943
 944       if (!strncmp(ir->name, "gl_", 3)) {
 945          setup_builtin_uniform_values(ir);
 946       } else {
 947          setup_uniform_values(ir);
 948       }
 949       break;
 950
 951    case ir_var_system_value:
 952       /* VertexID is stored by the VF as the last vertex element, but
 953        * we don't represent it with a flag in inputs_read, so we call
 954        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 955        */
 956       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 957       prog_data->uses_vertexid = true;
 958
 959       switch (ir->location) {
 960       case SYSTEM_VALUE_VERTEX_ID:
 961          reg->writemask = WRITEMASK_X;
 962          break;
 963       case SYSTEM_VALUE_INSTANCE_ID:
 964          reg->writemask = WRITEMASK_Y;
 965          break;
 966       default:
 967          assert(!"not reached");
 968          break;
 969       }
 970       break;
 971
 972    default:
 973       assert(!"not reached");
 974    }
 975
 976    reg->type = brw_type_for_base_type(ir->type);
 977    hash_table_insert(this->variable_ht, reg, ir);
 978 }
 979
 980 void
 981 vec4_visitor::visit(ir_loop *ir)
 982 {
 983    dst_reg counter;
 984
 985    /* We don't want debugging output to print the whole body of the
 986     * loop as the annotation.
 987     */
 988    this->base_ir = NULL;
 989
 990    if (ir->counter != NULL) {
 991       this->base_ir = ir->counter;
 992       ir->counter->accept(this);
 993       counter = *(variable_storage(ir->counter));
 994
 995       if (ir->from != NULL) {
 996          this->base_ir = ir->from;
 997          ir->from->accept(this);
 998
 999          emit(MOV(counter, this->result));
1000       }
1001    }
1002
1003    emit(BRW_OPCODE_DO);
1004
1005    if (ir->to) {
1006       this->base_ir = ir->to;
1007       ir->to->accept(this);
1008
1009       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010                brw_conditional_for_comparison(ir->cmp)));
1011
1012       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013       inst->predicate = BRW_PREDICATE_NORMAL;
1014    }
1015
1016    visit_instructions(&ir->body_instructions);
1017
1018
1019    if (ir->increment) {
1020       this->base_ir = ir->increment;
1021       ir->increment->accept(this);
1022       emit(ADD(counter, src_reg(counter), this->result));
1023    }
1024
1025    emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031    switch (ir->mode) {
1032    case ir_loop_jump::jump_break:
1033       emit(BRW_OPCODE_BREAK);
1034       break;
1035    case ir_loop_jump::jump_continue:
1036       emit(BRW_OPCODE_CONTINUE);
1037       break;
1038    }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045    assert(0);
1046    (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052    /* Ignore function bodies other than main() -- we shouldn't see calls to
1053     * them since they should all be inlined.
1054     */
1055    if (strcmp(ir->name, "main") == 0) {
1056       const ir_function_signature *sig;
1057       exec_list empty;
1058
1059       sig = ir->matching_signature(&empty);
1060
1061       assert(sig);
1062
1063       visit_instructions(&sig->body);
1064    }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071    if (!sat_src)
1072       return false;
1073
1074    sat_src->accept(this);
1075    src_reg src = this->result;
1076
1077    this->result = src_reg(this, ir->type);
1078    vec4_instruction *inst;
1079    inst = emit(MOV(dst_reg(this->result), src));
1080    inst->saturate = true;
1081
1082    return true;
1083 }
1084
1085 void
1086 vec4_visitor::emit_bool_comparison(unsigned int op,
1087                                  dst_reg dst, src_reg src0, src_reg src1)
1088 {
1089    /* original gen4 does destination conversion before comparison. */
1090    if (intel->gen < 5)
1091       dst.type = src0.type;
1092
1093    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1094
1095    dst.type = BRW_REGISTER_TYPE_D;
1096    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1097 }
1098
1099 void
1100 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1101                           src_reg src0, src_reg src1)
1102 {
1103    vec4_instruction *inst;
1104
1105    if (intel->gen >= 6) {
1106       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1107       inst->conditional_mod = conditionalmod;
1108    } else {
1109       emit(CMP(dst, src0, src1, conditionalmod));
1110
1111       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1112       inst->predicate = BRW_PREDICATE_NORMAL;
1113    }
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_expression *ir)
1118 {
1119    unsigned int operand;
1120    src_reg op[Elements(ir->operands)];
1121    src_reg result_src;
1122    dst_reg result_dst;
1123    vec4_instruction *inst;
1124
1125    if (try_emit_sat(ir))
1126       return;
1127
1128    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1129       this->result.file = BAD_FILE;
1130       ir->operands[operand]->accept(this);
1131       if (this->result.file == BAD_FILE) {
1132          printf("Failed to get tree for expression operand:\n");
1133          ir->operands[operand]->print();
1134          exit(1);
1135       }
1136       op[operand] = this->result;
1137
1138       /* Matrix expression operands should have been broken down to vector
1139        * operations already.
1140        */
1141       assert(!ir->operands[operand]->type->is_matrix());
1142    }
1143
1144    int vector_elements = ir->operands[0]->type->vector_elements;
1145    if (ir->operands[1]) {
1146       vector_elements = MAX2(vector_elements,
1147                              ir->operands[1]->type->vector_elements);
1148    }
1149
1150    this->result.file = BAD_FILE;
1151
1152    /* Storage for our result.  Ideally for an assignment we'd be using
1153     * the actual storage for the result here, instead.
1154     */
1155    result_src = src_reg(this, ir->type);
1156    /* convenience for the emit functions below. */
1157    result_dst = dst_reg(result_src);
1158    /* If nothing special happens, this is the result. */
1159    this->result = result_src;
1160    /* Limit writes to the channels that will be used by result_src later.
1161     * This does limit this temp's use as a temporary for multi-instruction
1162     * sequences.
1163     */
1164    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1165
1166    switch (ir->operation) {
1167    case ir_unop_logic_not:
1168       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1169        * ones complement of the whole register, not just bit 0.
1170        */
1171       emit(XOR(result_dst, op[0], src_reg(1)));
1172       break;
1173    case ir_unop_neg:
1174       op[0].negate = !op[0].negate;
1175       this->result = op[0];
1176       break;
1177    case ir_unop_abs:
1178       op[0].abs = true;
1179       op[0].negate = false;
1180       this->result = op[0];
1181       break;
1182
1183    case ir_unop_sign:
1184       emit(MOV(result_dst, src_reg(0.0f)));
1185
1186       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1187       inst = emit(MOV(result_dst, src_reg(1.0f)));
1188       inst->predicate = BRW_PREDICATE_NORMAL;
1189
1190       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1191       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1192       inst->predicate = BRW_PREDICATE_NORMAL;
1193
1194       break;
1195
1196    case ir_unop_rcp:
1197       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1198       break;
1199
1200    case ir_unop_exp2:
1201       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1202       break;
1203    case ir_unop_log2:
1204       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1205       break;
1206    case ir_unop_exp:
1207    case ir_unop_log:
1208       assert(!"not reached: should be handled by ir_explog_to_explog2");
1209       break;
1210    case ir_unop_sin:
1211    case ir_unop_sin_reduced:
1212       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1213       break;
1214    case ir_unop_cos:
1215    case ir_unop_cos_reduced:
1216       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1217       break;
1218
1219    case ir_unop_dFdx:
1220    case ir_unop_dFdy:
1221       assert(!"derivatives not valid in vertex shader");
1222       break;
1223
1224    case ir_unop_noise:
1225       assert(!"not reached: should be handled by lower_noise");
1226       break;
1227
1228    case ir_binop_add:
1229       emit(ADD(result_dst, op[0], op[1]));
1230       break;
1231    case ir_binop_sub:
1232       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1233       break;
1234
1235    case ir_binop_mul:
1236       if (ir->type->is_integer()) {
1237          /* For integer multiplication, the MUL uses the low 16 bits
1238           * of one of the operands (src0 on gen6, src1 on gen7).  The
1239           * MACH accumulates in the contribution of the upper 16 bits
1240           * of that operand.
1241           *
1242           * FINISHME: Emit just the MUL if we know an operand is small
1243           * enough.
1244           */
1245          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1246
1247          emit(MUL(acc, op[0], op[1]));
1248          emit(MACH(dst_null_d(), op[0], op[1]));
1249          emit(MOV(result_dst, src_reg(acc)));
1250       } else {
1251          emit(MUL(result_dst, op[0], op[1]));
1252       }
1253       break;
1254    case ir_binop_div:
1255       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1256       assert(ir->type->is_integer());
1257       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1258       break;
1259    case ir_binop_mod:
1260       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1261       assert(ir->type->is_integer());
1262       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1263       break;
1264
1265    case ir_binop_less:
1266    case ir_binop_greater:
1267    case ir_binop_lequal:
1268    case ir_binop_gequal:
1269    case ir_binop_equal:
1270    case ir_binop_nequal: {
1271       emit(CMP(result_dst, op[0], op[1],
1272                brw_conditional_for_comparison(ir->operation)));
1273       emit(AND(result_dst, result_src, src_reg(0x1)));
1274       break;
1275    }
1276
1277    case ir_binop_all_equal:
1278       /* "==" operator producing a scalar boolean. */
1279       if (ir->operands[0]->type->is_vector() ||
1280           ir->operands[1]->type->is_vector()) {
1281          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1282          emit(MOV(result_dst, src_reg(0)));
1283          inst = emit(MOV(result_dst, src_reg(1)));
1284          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1285       } else {
1286          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1287          emit(AND(result_dst, result_src, src_reg(0x1)));
1288       }
1289       break;
1290    case ir_binop_any_nequal:
1291       /* "!=" operator producing a scalar boolean. */
1292       if (ir->operands[0]->type->is_vector() ||
1293           ir->operands[1]->type->is_vector()) {
1294          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1295
1296          emit(MOV(result_dst, src_reg(0)));
1297          inst = emit(MOV(result_dst, src_reg(1)));
1298          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1299       } else {
1300          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1301          emit(AND(result_dst, result_src, src_reg(0x1)));
1302       }
1303       break;
1304
1305    case ir_unop_any:
1306       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1307       emit(MOV(result_dst, src_reg(0)));
1308
1309       inst = emit(MOV(result_dst, src_reg(1)));
1310       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1311       break;
1312
1313    case ir_binop_logic_xor:
1314       emit(XOR(result_dst, op[0], op[1]));
1315       break;
1316
1317    case ir_binop_logic_or:
1318       emit(OR(result_dst, op[0], op[1]));
1319       break;
1320
1321    case ir_binop_logic_and:
1322       emit(AND(result_dst, op[0], op[1]));
1323       break;
1324
1325    case ir_binop_dot:
1326       assert(ir->operands[0]->type->is_vector());
1327       assert(ir->operands[0]->type == ir->operands[1]->type);
1328       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1329       break;
1330
1331    case ir_unop_sqrt:
1332       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1333       break;
1334    case ir_unop_rsq:
1335       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1336       break;
1337
1338    case ir_unop_bitcast_i2f:
1339    case ir_unop_bitcast_u2f:
1340       this->result = op[0];
1341       this->result.type = BRW_REGISTER_TYPE_F;
1342       break;
1343
1344    case ir_unop_bitcast_f2i:
1345       this->result = op[0];
1346       this->result.type = BRW_REGISTER_TYPE_D;
1347       break;
1348
1349    case ir_unop_bitcast_f2u:
1350       this->result = op[0];
1351       this->result.type = BRW_REGISTER_TYPE_UD;
1352       break;
1353
1354    case ir_unop_i2f:
1355    case ir_unop_i2u:
1356    case ir_unop_u2i:
1357    case ir_unop_u2f:
1358    case ir_unop_b2f:
1359    case ir_unop_b2i:
1360    case ir_unop_f2i:
1361    case ir_unop_f2u:
1362       emit(MOV(result_dst, op[0]));
1363       break;
1364    case ir_unop_f2b:
1365    case ir_unop_i2b: {
1366       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1367       emit(AND(result_dst, result_src, src_reg(1)));
1368       break;
1369    }
1370
1371    case ir_unop_trunc:
1372       emit(RNDZ(result_dst, op[0]));
1373       break;
1374    case ir_unop_ceil:
1375       op[0].negate = !op[0].negate;
1376       inst = emit(RNDD(result_dst, op[0]));
1377       this->result.negate = true;
1378       break;
1379    case ir_unop_floor:
1380       inst = emit(RNDD(result_dst, op[0]));
1381       break;
1382    case ir_unop_fract:
1383       inst = emit(FRC(result_dst, op[0]));
1384       break;
1385    case ir_unop_round_even:
1386       emit(RNDE(result_dst, op[0]));
1387       break;
1388
1389    case ir_binop_min:
1390       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1391       break;
1392    case ir_binop_max:
1393       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1394       break;
1395
1396    case ir_binop_pow:
1397       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1398       break;
1399
1400    case ir_unop_bit_not:
1401       inst = emit(NOT(result_dst, op[0]));
1402       break;
1403    case ir_binop_bit_and:
1404       inst = emit(AND(result_dst, op[0], op[1]));
1405       break;
1406    case ir_binop_bit_xor:
1407       inst = emit(XOR(result_dst, op[0], op[1]));
1408       break;
1409    case ir_binop_bit_or:
1410       inst = emit(OR(result_dst, op[0], op[1]));
1411       break;
1412
1413    case ir_binop_lshift:
1414       inst = emit(SHL(result_dst, op[0], op[1]));
1415       break;
1416
1417    case ir_binop_rshift:
1418       if (ir->type->base_type == GLSL_TYPE_INT)
1419          inst = emit(ASR(result_dst, op[0], op[1]));
1420       else
1421          inst = emit(SHR(result_dst, op[0], op[1]));
1422       break;
1423
1424    case ir_binop_ubo_load: {
1425       ir_constant *uniform_block = ir->operands[0]->as_constant();
1426       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1427       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1428       src_reg offset = op[1];
1429
1430       /* Now, load the vector from that offset. */
1431       assert(ir->type->is_vector() || ir->type->is_scalar());
1432
1433       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1434       packed_consts.type = result.type;
1435       src_reg surf_index =
1436          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1437       if (const_offset_ir) {
1438          offset = src_reg(const_offset / 16);
1439       } else {
1440          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1441       }
1442
1443       vec4_instruction *pull =
1444          emit(new(mem_ctx) vec4_instruction(this,
1445                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1446                                             dst_reg(packed_consts),
1447                                             surf_index,
1448                                             offset));
1449       pull->base_mrf = 14;
1450       pull->mlen = 1;
1451
1452       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1453       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1454                                             const_offset % 16 / 4,
1455                                             const_offset % 16 / 4,
1456                                             const_offset % 16 / 4);
1457
1458       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1459       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1460          emit(CMP(result_dst, packed_consts, src_reg(0u),
1461                   BRW_CONDITIONAL_NZ));
1462          emit(AND(result_dst, result, src_reg(0x1)));
1463       } else {
1464          emit(MOV(result_dst, packed_consts));
1465       }
1466       break;
1467    }
1468
1469    case ir_quadop_vector:
1470       assert(!"not reached: should be handled by lower_quadop_vector");
1471       break;
1472    }
1473 }
1474
1475
1476 void
1477 vec4_visitor::visit(ir_swizzle *ir)
1478 {
1479    src_reg src;
1480    int i = 0;
1481    int swizzle[4];
1482
1483    /* Note that this is only swizzles in expressions, not those on the left
1484     * hand side of an assignment, which do write masking.  See ir_assignment
1485     * for that.
1486     */
1487
1488    ir->val->accept(this);
1489    src = this->result;
1490    assert(src.file != BAD_FILE);
1491
1492    for (i = 0; i < ir->type->vector_elements; i++) {
1493       switch (i) {
1494       case 0:
1495          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1496          break;
1497       case 1:
1498          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1499          break;
1500       case 2:
1501          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1502          break;
1503       case 3:
1504          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1505             break;
1506       }
1507    }
1508    for (; i < 4; i++) {
1509       /* Replicate the last channel out. */
1510       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1511    }
1512
1513    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1514
1515    this->result = src;
1516 }
1517
1518 void
1519 vec4_visitor::visit(ir_dereference_variable *ir)
1520 {
1521    const struct glsl_type *type = ir->type;
1522    dst_reg *reg = variable_storage(ir->var);
1523
1524    if (!reg) {
1525       fail("Failed to find variable storage for %s\n", ir->var->name);
1526       this->result = src_reg(brw_null_reg());
1527       return;
1528    }
1529
1530    this->result = src_reg(*reg);
1531
1532    /* System values get their swizzle from the dst_reg writemask */
1533    if (ir->var->mode == ir_var_system_value)
1534       return;
1535
1536    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1537       this->result.swizzle = swizzle_for_size(type->vector_elements);
1538 }
1539
1540 void
1541 vec4_visitor::visit(ir_dereference_array *ir)
1542 {
1543    ir_constant *constant_index;
1544    src_reg src;
1545    int element_size = type_size(ir->type);
1546
1547    constant_index = ir->array_index->constant_expression_value();
1548
1549    ir->array->accept(this);
1550    src = this->result;
1551
1552    if (constant_index) {
1553       src.reg_offset += constant_index->value.i[0] * element_size;
1554    } else {
1555       /* Variable index array dereference.  It eats the "vec4" of the
1556        * base of the array and an index that offsets the Mesa register
1557        * index.
1558        */
1559       ir->array_index->accept(this);
1560
1561       src_reg index_reg;
1562
1563       if (element_size == 1) {
1564          index_reg = this->result;
1565       } else {
1566          index_reg = src_reg(this, glsl_type::int_type);
1567
1568          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1569       }
1570
1571       if (src.reladdr) {
1572          src_reg temp = src_reg(this, glsl_type::int_type);
1573
1574          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1575
1576          index_reg = temp;
1577       }
1578
1579       src.reladdr = ralloc(mem_ctx, src_reg);
1580       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1581    }
1582
1583    /* If the type is smaller than a vec4, replicate the last channel out. */
1584    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1585       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1586    else
1587       src.swizzle = BRW_SWIZZLE_NOOP;
1588    src.type = brw_type_for_base_type(ir->type);
1589
1590    this->result = src;
1591 }
1592
1593 void
1594 vec4_visitor::visit(ir_dereference_record *ir)
1595 {
1596    unsigned int i;
1597    const glsl_type *struct_type = ir->record->type;
1598    int offset = 0;
1599
1600    ir->record->accept(this);
1601
1602    for (i = 0; i < struct_type->length; i++) {
1603       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1604          break;
1605       offset += type_size(struct_type->fields.structure[i].type);
1606    }
1607
1608    /* If the type is smaller than a vec4, replicate the last channel out. */
1609    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1610       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1611    else
1612       this->result.swizzle = BRW_SWIZZLE_NOOP;
1613    this->result.type = brw_type_for_base_type(ir->type);
1614
1615    this->result.reg_offset += offset;
1616 }
1617
1618 /**
1619  * We want to be careful in assignment setup to hit the actual storage
1620  * instead of potentially using a temporary like we might with the
1621  * ir_dereference handler.
1622  */
1623 static dst_reg
1624 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1625 {
1626    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1627     * access of a vector, it must be separated into a series conditional moves
1628     * before reaching this point (see ir_vec_index_to_cond_assign).
1629     */
1630    assert(ir->as_dereference());
1631    ir_dereference_array *deref_array = ir->as_dereference_array();
1632    if (deref_array) {
1633       assert(!deref_array->array->type->is_vector());
1634    }
1635
1636    /* Use the rvalue deref handler for the most part.  We'll ignore
1637     * swizzles in it and write swizzles using writemask, though.
1638     */
1639    ir->accept(v);
1640    return dst_reg(v->result);
1641 }
1642
1643 void
1644 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1645                               const struct glsl_type *type, uint32_t predicate)
1646 {
1647    if (type->base_type == GLSL_TYPE_STRUCT) {
1648       for (unsigned int i = 0; i < type->length; i++) {
1649          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1650       }
1651       return;
1652    }
1653
1654    if (type->is_array()) {
1655       for (unsigned int i = 0; i < type->length; i++) {
1656          emit_block_move(dst, src, type->fields.array, predicate);
1657       }
1658       return;
1659    }
1660
1661    if (type->is_matrix()) {
1662       const struct glsl_type *vec_type;
1663
1664       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1665                                          type->vector_elements, 1);
1666
1667       for (int i = 0; i < type->matrix_columns; i++) {
1668          emit_block_move(dst, src, vec_type, predicate);
1669       }
1670       return;
1671    }
1672
1673    assert(type->is_scalar() || type->is_vector());
1674
1675    dst->type = brw_type_for_base_type(type);
1676    src->type = dst->type;
1677
1678    dst->writemask = (1 << type->vector_elements) - 1;
1679
1680    src->swizzle = swizzle_for_size(type->vector_elements);
1681
1682    vec4_instruction *inst = emit(MOV(*dst, *src));
1683    inst->predicate = predicate;
1684
1685    dst->reg_offset++;
1686    src->reg_offset++;
1687 }
1688
1689
1690 /* If the RHS processing resulted in an instruction generating a
1691  * temporary value, and it would be easy to rewrite the instruction to
1692  * generate its result right into the LHS instead, do so.  This ends
1693  * up reliably removing instructions where it can be tricky to do so
1694  * later without real UD chain information.
1695  */
1696 bool
1697 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1698                                      dst_reg dst,
1699                                      src_reg src,
1700                                      vec4_instruction *pre_rhs_inst,
1701                                      vec4_instruction *last_rhs_inst)
1702 {
1703    /* This could be supported, but it would take more smarts. */
1704    if (ir->condition)
1705       return false;
1706
1707    if (pre_rhs_inst == last_rhs_inst)
1708       return false; /* No instructions generated to work with. */
1709
1710    /* Make sure the last instruction generated our source reg. */
1711    if (src.file != GRF ||
1712        src.file != last_rhs_inst->dst.file ||
1713        src.reg != last_rhs_inst->dst.reg ||
1714        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1715        src.reladdr ||
1716        src.abs ||
1717        src.negate ||
1718        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1719       return false;
1720
1721    /* Check that that last instruction fully initialized the channels
1722     * we want to use, in the order we want to use them.  We could
1723     * potentially reswizzle the operands of many instructions so that
1724     * we could handle out of order channels, but don't yet.
1725     */
1726
1727    for (unsigned i = 0; i < 4; i++) {
1728       if (dst.writemask & (1 << i)) {
1729          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1730             return false;
1731
1732          if (BRW_GET_SWZ(src.swizzle, i) != i)
1733             return false;
1734       }
1735    }
1736
1737    /* Success!  Rewrite the instruction. */
1738    last_rhs_inst->dst.file = dst.file;
1739    last_rhs_inst->dst.reg = dst.reg;
1740    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1741    last_rhs_inst->dst.reladdr = dst.reladdr;
1742    last_rhs_inst->dst.writemask &= dst.writemask;
1743
1744    return true;
1745 }
1746
1747 void
1748 vec4_visitor::visit(ir_assignment *ir)
1749 {
1750    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1751    uint32_t predicate = BRW_PREDICATE_NONE;
1752
1753    if (!ir->lhs->type->is_scalar() &&
1754        !ir->lhs->type->is_vector()) {
1755       ir->rhs->accept(this);
1756       src_reg src = this->result;
1757
1758       if (ir->condition) {
1759          emit_bool_to_cond_code(ir->condition, &predicate);
1760       }
1761
1762       /* emit_block_move doesn't account for swizzles in the source register.
1763        * This should be ok, since the source register is a structure or an
1764        * array, and those can't be swizzled.  But double-check to be sure.
1765        */
1766       assert(src.swizzle ==
1767              (ir->rhs->type->is_matrix()
1768               ? swizzle_for_size(ir->rhs->type->vector_elements)
1769               : BRW_SWIZZLE_NOOP));
1770
1771       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1772       return;
1773    }
1774
1775    /* Now we're down to just a scalar/vector with writemasks. */
1776    int i;
1777
1778    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1779    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1780
1781    ir->rhs->accept(this);
1782
1783    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1784
1785    src_reg src = this->result;
1786
1787    int swizzles[4];
1788    int first_enabled_chan = 0;
1789    int src_chan = 0;
1790
1791    assert(ir->lhs->type->is_vector() ||
1792           ir->lhs->type->is_scalar());
1793    dst.writemask = ir->write_mask;
1794
1795    for (int i = 0; i < 4; i++) {
1796       if (dst.writemask & (1 << i)) {
1797          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1798          break;
1799       }
1800    }
1801
1802    /* Swizzle a small RHS vector into the channels being written.
1803     *
1804     * glsl ir treats write_mask as dictating how many channels are
1805     * present on the RHS while in our instructions we need to make
1806     * those channels appear in the slots of the vec4 they're written to.
1807     */
1808    for (int i = 0; i < 4; i++) {
1809       if (dst.writemask & (1 << i))
1810          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1811       else
1812          swizzles[i] = first_enabled_chan;
1813    }
1814    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1815                               swizzles[2], swizzles[3]);
1816
1817    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1818       return;
1819    }
1820
1821    if (ir->condition) {
1822       emit_bool_to_cond_code(ir->condition, &predicate);
1823    }
1824
1825    for (i = 0; i < type_size(ir->lhs->type); i++) {
1826       vec4_instruction *inst = emit(MOV(dst, src));
1827       inst->predicate = predicate;
1828
1829       dst.reg_offset++;
1830       src.reg_offset++;
1831    }
1832 }
1833
1834 void
1835 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1836 {
1837    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1838       foreach_list(node, &ir->components) {
1839          ir_constant *field_value = (ir_constant *)node;
1840
1841          emit_constant_values(dst, field_value);
1842       }
1843       return;
1844    }
1845
1846    if (ir->type->is_array()) {
1847       for (unsigned int i = 0; i < ir->type->length; i++) {
1848          emit_constant_values(dst, ir->array_elements[i]);
1849       }
1850       return;
1851    }
1852
1853    if (ir->type->is_matrix()) {
1854       for (int i = 0; i < ir->type->matrix_columns; i++) {
1855          float *vec = &ir->value.f[i * ir->type->vector_elements];
1856
1857          for (int j = 0; j < ir->type->vector_elements; j++) {
1858             dst->writemask = 1 << j;
1859             dst->type = BRW_REGISTER_TYPE_F;
1860
1861             emit(MOV(*dst, src_reg(vec[j])));
1862          }
1863          dst->reg_offset++;
1864       }
1865       return;
1866    }
1867
1868    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1869
1870    for (int i = 0; i < ir->type->vector_elements; i++) {
1871       if (!(remaining_writemask & (1 << i)))
1872          continue;
1873
1874       dst->writemask = 1 << i;
1875       dst->type = brw_type_for_base_type(ir->type);
1876
1877       /* Find other components that match the one we're about to
1878        * write.  Emits fewer instructions for things like vec4(0.5,
1879        * 1.5, 1.5, 1.5).
1880        */
1881       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1882          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1883             if (ir->value.b[i] == ir->value.b[j])
1884                dst->writemask |= (1 << j);
1885          } else {
1886             /* u, i, and f storage all line up, so no need for a
1887              * switch case for comparing each type.
1888              */
1889             if (ir->value.u[i] == ir->value.u[j])
1890                dst->writemask |= (1 << j);
1891          }
1892       }
1893
1894       switch (ir->type->base_type) {
1895       case GLSL_TYPE_FLOAT:
1896          emit(MOV(*dst, src_reg(ir->value.f[i])));
1897          break;
1898       case GLSL_TYPE_INT:
1899          emit(MOV(*dst, src_reg(ir->value.i[i])));
1900          break;
1901       case GLSL_TYPE_UINT:
1902          emit(MOV(*dst, src_reg(ir->value.u[i])));
1903          break;
1904       case GLSL_TYPE_BOOL:
1905          emit(MOV(*dst, src_reg(ir->value.b[i])));
1906          break;
1907       default:
1908          assert(!"Non-float/uint/int/bool constant");
1909          break;
1910       }
1911
1912       remaining_writemask &= ~dst->writemask;
1913    }
1914    dst->reg_offset++;
1915 }
1916
1917 void
1918 vec4_visitor::visit(ir_constant *ir)
1919 {
1920    dst_reg dst = dst_reg(this, ir->type);
1921    this->result = src_reg(dst);
1922
1923    emit_constant_values(&dst, ir);
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_call *ir)
1928 {
1929    assert(!"not reached");
1930 }
1931
1932 void
1933 vec4_visitor::visit(ir_texture *ir)
1934 {
1935    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1936
1937    /* Should be lowered by do_lower_texture_projection */
1938    assert(!ir->projector);
1939
1940    /* Generate code to compute all the subexpression trees.  This has to be
1941     * done before loading any values into MRFs for the sampler message since
1942     * generating these values may involve SEND messages that need the MRFs.
1943     */
1944    src_reg coordinate;
1945    if (ir->coordinate) {
1946       ir->coordinate->accept(this);
1947       coordinate = this->result;
1948    }
1949
1950    src_reg shadow_comparitor;
1951    if (ir->shadow_comparitor) {
1952       ir->shadow_comparitor->accept(this);
1953       shadow_comparitor = this->result;
1954    }
1955
1956    const glsl_type *lod_type;
1957    src_reg lod, dPdx, dPdy;
1958    switch (ir->op) {
1959    case ir_txf:
1960    case ir_txl:
1961    case ir_txs:
1962       ir->lod_info.lod->accept(this);
1963       lod = this->result;
1964       lod_type = ir->lod_info.lod->type;
1965       break;
1966    case ir_txd:
1967       ir->lod_info.grad.dPdx->accept(this);
1968       dPdx = this->result;
1969
1970       ir->lod_info.grad.dPdy->accept(this);
1971       dPdy = this->result;
1972
1973       lod_type = ir->lod_info.grad.dPdx->type;
1974       break;
1975    case ir_tex:
1976    case ir_txb:
1977       break;
1978    }
1979
1980    vec4_instruction *inst = NULL;
1981    switch (ir->op) {
1982    case ir_tex:
1983    case ir_txl:
1984       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1985       break;
1986    case ir_txd:
1987       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1988       break;
1989    case ir_txf:
1990       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1991       break;
1992    case ir_txs:
1993       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1994       break;
1995    case ir_txb:
1996       assert(!"TXB is not valid for vertex shaders.");
1997    }
1998
1999    /* Texel offsets go in the message header; Gen4 also requires headers. */
2000    inst->header_present = ir->offset || intel->gen < 5;
2001    inst->base_mrf = 2;
2002    inst->mlen = inst->header_present + 1; /* always at least one */
2003    inst->sampler = sampler;
2004    inst->dst = dst_reg(this, ir->type);
2005    inst->shadow_compare = ir->shadow_comparitor != NULL;
2006
2007    if (ir->offset != NULL && ir->op != ir_txf)
2008       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2009
2010    /* MRF for the first parameter */
2011    int param_base = inst->base_mrf + inst->header_present;
2012
2013    if (ir->op == ir_txs) {
2014       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2015       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2016    } else {
2017       int i, coord_mask = 0, zero_mask = 0;
2018       /* Load the coordinate */
2019       /* FINISHME: gl_clamp_mask and saturate */
2020       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2021          coord_mask |= (1 << i);
2022       for (; i < 4; i++)
2023          zero_mask |= (1 << i);
2024
2025       if (ir->offset && ir->op == ir_txf) {
2026          /* It appears that the ld instruction used for txf does its
2027           * address bounds check before adding in the offset.  To work
2028           * around this, just add the integer offset to the integer
2029           * texel coordinate, and don't put the offset in the header.
2030           */
2031          ir_constant *offset = ir->offset->as_constant();
2032          assert(offset);
2033
2034          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2035             src_reg src = coordinate;
2036             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2037                                        BRW_GET_SWZ(src.swizzle, j),
2038                                        BRW_GET_SWZ(src.swizzle, j),
2039                                        BRW_GET_SWZ(src.swizzle, j));
2040             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2041                      src, offset->value.i[j]));
2042          }
2043       } else {
2044          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2045                   coordinate));
2046       }
2047       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2048                src_reg(0)));
2049       /* Load the shadow comparitor */
2050       if (ir->shadow_comparitor) {
2051          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2052                           WRITEMASK_X),
2053                   shadow_comparitor));
2054          inst->mlen++;
2055       }
2056
2057       /* Load the LOD info */
2058       if (ir->op == ir_txl) {
2059          int mrf, writemask;
2060          if (intel->gen >= 5) {
2061             mrf = param_base + 1;
2062             if (ir->shadow_comparitor) {
2063                writemask = WRITEMASK_Y;
2064                /* mlen already incremented */
2065             } else {
2066                writemask = WRITEMASK_X;
2067                inst->mlen++;
2068             }
2069          } else /* intel->gen == 4 */ {
2070             mrf = param_base;
2071             writemask = WRITEMASK_Z;
2072          }
2073          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2074       } else if (ir->op == ir_txf) {
2075          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2076                   lod));
2077       } else if (ir->op == ir_txd) {
2078          const glsl_type *type = lod_type;
2079
2080          if (intel->gen >= 5) {
2081             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2082             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2083             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2084             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2085             inst->mlen++;
2086
2087             if (ir->type->vector_elements == 3) {
2088                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2089                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2090                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2091                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2092                inst->mlen++;
2093             }
2094          } else /* intel->gen == 4 */ {
2095             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2096             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2097             inst->mlen += 2;
2098          }
2099       }
2100    }
2101
2102    emit(inst);
2103
2104    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2105     * spec requires layers.
2106     */
2107    if (ir->op == ir_txs) {
2108       glsl_type const *type = ir->sampler->type;
2109       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2110           type->sampler_array) {
2111          emit_math(SHADER_OPCODE_INT_QUOTIENT,
2112                    with_writemask(inst->dst, WRITEMASK_Z),
2113                    src_reg(inst->dst), src_reg(6));
2114       }
2115    }
2116
2117    swizzle_result(ir, src_reg(inst->dst), sampler);
2118 }
2119
2120 void
2121 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2122 {
2123    this->result = orig_val;
2124
2125    int s = c->key.tex.swizzles[sampler];
2126
2127    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2128                         || s == SWIZZLE_NOOP)
2129       return;
2130
2131    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2132    int swizzle[4];
2133
2134    for (int i = 0; i < 4; i++) {
2135       switch (GET_SWZ(s, i)) {
2136       case SWIZZLE_ZERO:
2137          zero_mask |= (1 << i);
2138          break;
2139       case SWIZZLE_ONE:
2140          one_mask |= (1 << i);
2141          break;
2142       default:
2143          copy_mask |= (1 << i);
2144          swizzle[i] = GET_SWZ(s, i);
2145          break;
2146       }
2147    }
2148
2149    this->result = src_reg(this, ir->type);
2150    dst_reg swizzled_result(this->result);
2151
2152    if (copy_mask) {
2153       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2154       swizzled_result.writemask = copy_mask;
2155       emit(MOV(swizzled_result, orig_val));
2156    }
2157
2158    if (zero_mask) {
2159       swizzled_result.writemask = zero_mask;
2160       emit(MOV(swizzled_result, src_reg(0.0f)));
2161    }
2162
2163    if (one_mask) {
2164       swizzled_result.writemask = one_mask;
2165       emit(MOV(swizzled_result, src_reg(1.0f)));
2166    }
2167 }
2168
2169 void
2170 vec4_visitor::visit(ir_return *ir)
2171 {
2172    assert(!"not reached");
2173 }
2174
2175 void
2176 vec4_visitor::visit(ir_discard *ir)
2177 {
2178    assert(!"not reached");
2179 }
2180
2181 void
2182 vec4_visitor::visit(ir_if *ir)
2183 {
2184    /* Don't point the annotation at the if statement, because then it plus
2185     * the then and else blocks get printed.
2186     */
2187    this->base_ir = ir->condition;
2188
2189    if (intel->gen == 6) {
2190       emit_if_gen6(ir);
2191    } else {
2192       uint32_t predicate;
2193       emit_bool_to_cond_code(ir->condition, &predicate);
2194       emit(IF(predicate));
2195    }
2196
2197    visit_instructions(&ir->then_instructions);
2198
2199    if (!ir->else_instructions.is_empty()) {
2200       this->base_ir = ir->condition;
2201       emit(BRW_OPCODE_ELSE);
2202
2203       visit_instructions(&ir->else_instructions);
2204    }
2205
2206    this->base_ir = ir->condition;
2207    emit(BRW_OPCODE_ENDIF);
2208 }
2209
2210 void
2211 vec4_visitor::emit_ndc_computation()
2212 {
2213    /* Get the position */
2214    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2215
2216    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2217    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2218    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2219
2220    current_annotation = "NDC";
2221    dst_reg ndc_w = ndc;
2222    ndc_w.writemask = WRITEMASK_W;
2223    src_reg pos_w = pos;
2224    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2225    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2226
2227    dst_reg ndc_xyz = ndc;
2228    ndc_xyz.writemask = WRITEMASK_XYZ;
2229
2230    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2231 }
2232
2233 void
2234 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2235 {
2236    if (intel->gen < 6 &&
2237        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2238         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2239       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2240       dst_reg header1_w = header1;
2241       header1_w.writemask = WRITEMASK_W;
2242       GLuint i;
2243
2244       emit(MOV(header1, 0u));
2245
2246       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2247          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2248
2249          current_annotation = "Point size";
2250          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2251          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2252       }
2253
2254       current_annotation = "Clipping flags";
2255       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2256          vec4_instruction *inst;
2257
2258          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2259                          src_reg(this->userplane[i])));
2260          inst->conditional_mod = BRW_CONDITIONAL_L;
2261
2262          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2263          inst->predicate = BRW_PREDICATE_NORMAL;
2264       }
2265
2266       /* i965 clipping workaround:
2267        * 1) Test for -ve rhw
2268        * 2) If set,
2269        *      set ndc = (0,0,0,0)
2270        *      set ucp[6] = 1
2271        *
2272        * Later, clipping will detect ucp[6] and ensure the primitive is
2273        * clipped against all fixed planes.
2274        */
2275       if (brw->has_negative_rhw_bug) {
2276 #if 0
2277          /* FINISHME */
2278          brw_CMP(p,
2279                  vec8(brw_null_reg()),
2280                  BRW_CONDITIONAL_L,
2281                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2282                  brw_imm_f(0));
2283
2284          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2285          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2286          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2287 #endif
2288       }
2289
2290       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2291    } else if (intel->gen < 6) {
2292       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2293    } else {
2294       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2295       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2296          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2297                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2298       }
2299    }
2300 }
2301
2302 void
2303 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2304 {
2305    if (intel->gen < 6) {
2306       /* Clip distance slots are set aside in gen5, but they are not used.  It
2307        * is not clear whether we actually need to set aside space for them,
2308        * but the performance cost is negligible.
2309        */
2310       return;
2311    }
2312
2313    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2314     *
2315     *     "If a linked set of shaders forming the vertex stage contains no
2316     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2317     *     application has requested clipping against user clip planes through
2318     *     the API, then the coordinate written to gl_Position is used for
2319     *     comparison against the user clip planes."
2320     *
2321     * This function is only called if the shader didn't write to
2322     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2323     * if the user wrote to it; otherwise we use gl_Position.
2324     */
2325    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2326    if (!(c->prog_data.outputs_written
2327          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2328       clip_vertex = VERT_RESULT_HPOS;
2329    }
2330
2331    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2332         ++i) {
2333       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2334                src_reg(output_reg[clip_vertex]),
2335                src_reg(this->userplane[i + offset])));
2336    }
2337 }
2338
2339 void
2340 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2341 {
2342    assert (vert_result < VERT_RESULT_MAX);
2343    reg.type = output_reg[vert_result].type;
2344    current_annotation = output_reg_annotation[vert_result];
2345    /* Copy the register, saturating if necessary */
2346    vec4_instruction *inst = emit(MOV(reg,
2347                                      src_reg(output_reg[vert_result])));
2348    if ((vert_result == VERT_RESULT_COL0 ||
2349         vert_result == VERT_RESULT_COL1 ||
2350         vert_result == VERT_RESULT_BFC0 ||
2351         vert_result == VERT_RESULT_BFC1) &&
2352        c->key.clamp_vertex_color) {
2353       inst->saturate = true;
2354    }
2355 }
2356
2357 void
2358 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2359 {
2360    struct brw_reg hw_reg = brw_message_reg(mrf);
2361    dst_reg reg = dst_reg(MRF, mrf);
2362    reg.type = BRW_REGISTER_TYPE_F;
2363
2364    switch (vert_result) {
2365    case VERT_RESULT_PSIZ:
2366       /* PSIZ is always in slot 0, and is coupled with other flags. */
2367       current_annotation = "indices, point width, clip flags";
2368       emit_psiz_and_flags(hw_reg);
2369       break;
2370    case BRW_VERT_RESULT_NDC:
2371       current_annotation = "NDC";
2372       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2373       break;
2374    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2375    case VERT_RESULT_HPOS:
2376       current_annotation = "gl_Position";
2377       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2378       break;
2379    case VERT_RESULT_CLIP_DIST0:
2380    case VERT_RESULT_CLIP_DIST1:
2381       if (this->c->key.uses_clip_distance) {
2382          emit_generic_urb_slot(reg, vert_result);
2383       } else {
2384          current_annotation = "user clip distances";
2385          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2386       }
2387       break;
2388    case VERT_RESULT_EDGE:
2389       /* This is present when doing unfilled polygons.  We're supposed to copy
2390        * the edge flag from the user-provided vertex array
2391        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2392        * of that attribute (starts as 1.0f).  This is then used in clipping to
2393        * determine which edges should be drawn as wireframe.
2394        */
2395       current_annotation = "edge flag";
2396       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2397                                     glsl_type::float_type, WRITEMASK_XYZW))));
2398       break;
2399    case BRW_VERT_RESULT_PAD:
2400       /* No need to write to this slot */
2401       break;
2402    default:
2403       emit_generic_urb_slot(reg, vert_result);
2404       break;
2405    }
2406 }
2407
2408 static int
2409 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2410 {
2411    struct intel_context *intel = &brw->intel;
2412
2413    if (intel->gen >= 6) {
2414       /* URB data written (does not include the message header reg) must
2415        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2416        * section 5.4.3.2.2: URB_INTERLEAVED.
2417        *
2418        * URB entries are allocated on a multiple of 1024 bits, so an
2419        * extra 128 bits written here to make the end align to 256 is
2420        * no problem.
2421        */
2422       if ((mlen % 2) != 1)
2423          mlen++;
2424    }
2425
2426    return mlen;
2427 }
2428
2429 /**
2430  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2431  * complete the VS thread.
2432  *
2433  * The VUE layout is documented in Volume 2a.
2434  */
2435 void
2436 vec4_visitor::emit_urb_writes()
2437 {
2438    /* MRF 0 is reserved for the debugger, so start with message header
2439     * in MRF 1.
2440     */
2441    int base_mrf = 1;
2442    int mrf = base_mrf;
2443    /* In the process of generating our URB write message contents, we
2444     * may need to unspill a register or load from an array.  Those
2445     * reads would use MRFs 14-15.
2446     */
2447    int max_usable_mrf = 13;
2448
2449    /* The following assertion verifies that max_usable_mrf causes an
2450     * even-numbered amount of URB write data, which will meet gen6's
2451     * requirements for length alignment.
2452     */
2453    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2454
2455    /* First mrf is the g0-based message header containing URB handles and such,
2456     * which is implied in VS_OPCODE_URB_WRITE.
2457     */
2458    mrf++;
2459
2460    if (intel->gen < 6) {
2461       emit_ndc_computation();
2462    }
2463
2464    /* Set up the VUE data for the first URB write */
2465    int slot;
2466    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2467       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2468
2469       /* If this was max_usable_mrf, we can't fit anything more into this URB
2470        * WRITE.
2471        */
2472       if (mrf > max_usable_mrf) {
2473          slot++;
2474          break;
2475       }
2476    }
2477
2478    current_annotation = "URB write";
2479    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2480    inst->base_mrf = base_mrf;
2481    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2482    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2483
2484    /* Optional second URB write */
2485    if (!inst->eot) {
2486       mrf = base_mrf + 1;
2487
2488       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2489          assert(mrf < max_usable_mrf);
2490
2491          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2492       }
2493
2494       current_annotation = "URB write";
2495       inst = emit(VS_OPCODE_URB_WRITE);
2496       inst->base_mrf = base_mrf;
2497       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2498       inst->eot = true;
2499       /* URB destination offset.  In the previous write, we got MRFs
2500        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2501        * URB row increments, and each of our MRFs is half of one of
2502        * those, since we're doing interleaved writes.
2503        */
2504       inst->offset = (max_usable_mrf - base_mrf) / 2;
2505    }
2506 }
2507
2508 src_reg
2509 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2510                                  src_reg *reladdr, int reg_offset)
2511 {
2512    /* Because we store the values to scratch interleaved like our
2513     * vertex data, we need to scale the vec4 index by 2.
2514     */
2515    int message_header_scale = 2;
2516
2517    /* Pre-gen6, the message header uses byte offsets instead of vec4
2518     * (16-byte) offset units.
2519     */
2520    if (intel->gen < 6)
2521       message_header_scale *= 16;
2522
2523    if (reladdr) {
2524       src_reg index = src_reg(this, glsl_type::int_type);
2525
2526       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2527       emit_before(inst, MUL(dst_reg(index),
2528                             index, src_reg(message_header_scale)));
2529
2530       return index;
2531    } else {
2532       return src_reg(reg_offset * message_header_scale);
2533    }
2534 }
2535
2536 src_reg
2537 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2538                                        src_reg *reladdr, int reg_offset)
2539 {
2540    if (reladdr) {
2541       src_reg index = src_reg(this, glsl_type::int_type);
2542
2543       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2544
2545       /* Pre-gen6, the message header uses byte offsets instead of vec4
2546        * (16-byte) offset units.
2547        */
2548       if (intel->gen < 6) {
2549          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2550       }
2551
2552       return index;
2553    } else {
2554       int message_header_scale = intel->gen < 6 ? 16 : 1;
2555       return src_reg(reg_offset * message_header_scale);
2556    }
2557 }
2558
2559 /**
2560  * Emits an instruction before @inst to load the value named by @orig_src
2561  * from scratch space at @base_offset to @temp.
2562  *
2563  * @base_offset is measured in 32-byte units (the size of a register).
2564  */
2565 void
2566 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2567                                 dst_reg temp, src_reg orig_src,
2568                                 int base_offset)
2569 {
2570    int reg_offset = base_offset + orig_src.reg_offset;
2571    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2572
2573    emit_before(inst, SCRATCH_READ(temp, index));
2574 }
2575
2576 /**
2577  * Emits an instruction after @inst to store the value to be written
2578  * to @orig_dst to scratch space at @base_offset, from @temp.
2579  *
2580  * @base_offset is measured in 32-byte units (the size of a register).
2581  */
2582 void
2583 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2584 {
2585    int reg_offset = base_offset + inst->dst.reg_offset;
2586    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2587
2588    /* Create a temporary register to store *inst's result in.
2589     *
2590     * We have to be careful in MOVing from our temporary result register in
2591     * the scratch write.  If we swizzle from channels of the temporary that
2592     * weren't initialized, it will confuse live interval analysis, which will
2593     * make spilling fail to make progress.
2594     */
2595    src_reg temp = src_reg(this, glsl_type::vec4_type);
2596    temp.type = inst->dst.type;
2597    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2598    int swizzles[4];
2599    for (int i = 0; i < 4; i++)
2600       if (inst->dst.writemask & (1 << i))
2601          swizzles[i] = i;
2602       else
2603          swizzles[i] = first_writemask_chan;
2604    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2605                                swizzles[2], swizzles[3]);
2606
2607    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2608                                        inst->dst.writemask));
2609    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2610    write->predicate = inst->predicate;
2611    write->ir = inst->ir;
2612    write->annotation = inst->annotation;
2613    inst->insert_after(write);
2614
2615    inst->dst.file = temp.file;
2616    inst->dst.reg = temp.reg;
2617    inst->dst.reg_offset = temp.reg_offset;
2618    inst->dst.reladdr = NULL;
2619 }
2620
2621 /**
2622  * We can't generally support array access in GRF space, because a
2623  * single instruction's destination can only span 2 contiguous
2624  * registers.  So, we send all GRF arrays that get variable index
2625  * access to scratch space.
2626  */
2627 void
2628 vec4_visitor::move_grf_array_access_to_scratch()
2629 {
2630    int scratch_loc[this->virtual_grf_count];
2631
2632    for (int i = 0; i < this->virtual_grf_count; i++) {
2633       scratch_loc[i] = -1;
2634    }
2635
2636    /* First, calculate the set of virtual GRFs that need to be punted
2637     * to scratch due to having any array access on them, and where in
2638     * scratch.
2639     */
2640    foreach_list(node, &this->instructions) {
2641       vec4_instruction *inst = (vec4_instruction *)node;
2642
2643       if (inst->dst.file == GRF && inst->dst.reladdr &&
2644           scratch_loc[inst->dst.reg] == -1) {
2645          scratch_loc[inst->dst.reg] = c->last_scratch;
2646          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2647       }
2648
2649       for (int i = 0 ; i < 3; i++) {
2650          src_reg *src = &inst->src[i];
2651
2652          if (src->file == GRF && src->reladdr &&
2653              scratch_loc[src->reg] == -1) {
2654             scratch_loc[src->reg] = c->last_scratch;
2655             c->last_scratch += this->virtual_grf_sizes[src->reg];
2656          }
2657       }
2658    }
2659
2660    /* Now, for anything that will be accessed through scratch, rewrite
2661     * it to load/store.  Note that this is a _safe list walk, because
2662     * we may generate a new scratch_write instruction after the one
2663     * we're processing.
2664     */
2665    foreach_list_safe(node, &this->instructions) {
2666       vec4_instruction *inst = (vec4_instruction *)node;
2667
2668       /* Set up the annotation tracking for new generated instructions. */
2669       base_ir = inst->ir;
2670       current_annotation = inst->annotation;
2671
2672       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2673          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2674       }
2675
2676       for (int i = 0 ; i < 3; i++) {
2677          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2678             continue;
2679
2680          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2681
2682          emit_scratch_read(inst, temp, inst->src[i],
2683                            scratch_loc[inst->src[i].reg]);
2684
2685          inst->src[i].file = temp.file;
2686          inst->src[i].reg = temp.reg;
2687          inst->src[i].reg_offset = temp.reg_offset;
2688          inst->src[i].reladdr = NULL;
2689       }
2690    }
2691 }
2692
2693 /**
2694  * Emits an instruction before @inst to load the value named by @orig_src
2695  * from the pull constant buffer (surface) at @base_offset to @temp.
2696  */
2697 void
2698 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2699                                       dst_reg temp, src_reg orig_src,
2700                                       int base_offset)
2701 {
2702    int reg_offset = base_offset + orig_src.reg_offset;
2703    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2704    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2705    vec4_instruction *load;
2706
2707    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2708                                         temp, index, offset);
2709    load->base_mrf = 14;
2710    load->mlen = 1;
2711    emit_before(inst, load);
2712 }
2713
2714 /**
2715  * Implements array access of uniforms by inserting a
2716  * PULL_CONSTANT_LOAD instruction.
2717  *
2718  * Unlike temporary GRF array access (where we don't support it due to
2719  * the difficulty of doing relative addressing on instruction
2720  * destinations), we could potentially do array access of uniforms
2721  * that were loaded in GRF space as push constants.  In real-world
2722  * usage we've seen, though, the arrays being used are always larger
2723  * than we could load as push constants, so just always move all
2724  * uniform array access out to a pull constant buffer.
2725  */
2726 void
2727 vec4_visitor::move_uniform_array_access_to_pull_constants()
2728 {
2729    int pull_constant_loc[this->uniforms];
2730
2731    for (int i = 0; i < this->uniforms; i++) {
2732       pull_constant_loc[i] = -1;
2733    }
2734
2735    /* Walk through and find array access of uniforms.  Put a copy of that
2736     * uniform in the pull constant buffer.
2737     *
2738     * Note that we don't move constant-indexed accesses to arrays.  No
2739     * testing has been done of the performance impact of this choice.
2740     */
2741    foreach_list_safe(node, &this->instructions) {
2742       vec4_instruction *inst = (vec4_instruction *)node;
2743
2744       for (int i = 0 ; i < 3; i++) {
2745          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2746             continue;
2747
2748          int uniform = inst->src[i].reg;
2749
2750          /* If this array isn't already present in the pull constant buffer,
2751           * add it.
2752           */
2753          if (pull_constant_loc[uniform] == -1) {
2754             const float **values = &prog_data->param[uniform * 4];
2755
2756             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2757
2758             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2759                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2760             }
2761          }
2762
2763          /* Set up the annotation tracking for new generated instructions. */
2764          base_ir = inst->ir;
2765          current_annotation = inst->annotation;
2766
2767          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2768
2769          emit_pull_constant_load(inst, temp, inst->src[i],
2770                                  pull_constant_loc[uniform]);
2771
2772          inst->src[i].file = temp.file;
2773          inst->src[i].reg = temp.reg;
2774          inst->src[i].reg_offset = temp.reg_offset;
2775          inst->src[i].reladdr = NULL;
2776       }
2777    }
2778
2779    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2780     * no need to track them as larger-than-vec4 objects.  This will be
2781     * relied on in cutting out unused uniform vectors from push
2782     * constants.
2783     */
2784    split_uniform_registers();
2785 }
2786
2787 void
2788 vec4_visitor::resolve_ud_negate(src_reg *reg)
2789 {
2790    if (reg->type != BRW_REGISTER_TYPE_UD ||
2791        !reg->negate)
2792       return;
2793
2794    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2795    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2796    *reg = temp;
2797 }
2798
2799 vec4_visitor::vec4_visitor(struct brw_context *brw,
2800                            struct brw_vs_compile *c,
2801                            struct gl_shader_program *prog,
2802                            struct brw_shader *shader,
2803                            void *mem_ctx)
2804 {
2805    this->c = c;
2806    this->brw = brw;
2807    this->intel = &brw->intel;
2808    this->ctx = &intel->ctx;
2809    this->prog = prog;
2810    this->shader = shader;
2811
2812    this->mem_ctx = mem_ctx;
2813    this->failed = false;
2814
2815    this->base_ir = NULL;
2816    this->current_annotation = NULL;
2817    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2818
2819    this->c = c;
2820    this->vp = &c->vp->program;
2821    this->prog_data = &c->prog_data;
2822
2823    this->variable_ht = hash_table_ctor(0,
2824                                        hash_table_pointer_hash,
2825                                        hash_table_pointer_compare);
2826
2827    this->virtual_grf_def = NULL;
2828    this->virtual_grf_use = NULL;
2829    this->virtual_grf_sizes = NULL;
2830    this->virtual_grf_count = 0;
2831    this->virtual_grf_reg_map = NULL;
2832    this->virtual_grf_reg_count = 0;
2833    this->virtual_grf_array_size = 0;
2834    this->live_intervals_valid = false;
2835
2836    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2837
2838    this->uniforms = 0;
2839 }
2840
2841 vec4_visitor::~vec4_visitor()
2842 {
2843    hash_table_dtor(this->variable_ht);
2844 }
2845
2846
2847 void
2848 vec4_visitor::fail(const char *format, ...)
2849 {
2850    va_list va;
2851    char *msg;
2852
2853    if (failed)
2854       return;
2855
2856    failed = true;
2857
2858    va_start(va, format);
2859    msg = ralloc_vasprintf(mem_ctx, format, va);
2860    va_end(va);
2861    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2862
2863    this->fail_msg = msg;
2864
2865    if (INTEL_DEBUG & DEBUG_VS) {
2866       fprintf(stderr, "%s",  msg);
2867    }
2868 }
2869
2870 } /* namespace brw */