i965/vs: Store texturing results into a vec4 temporary.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
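/* The ALU1/ALU2 macros below expand into convenience constructors (NOT,
 * MOV, ADD, MUL, ...) that only build a vec4_instruction; the caller
 * still hands the result to emit() or emit_before() to append it to the
 * instruction stream, e.g. emit(MOV(dst, src)).
 */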
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU2(ADD)
117 ALU2(MUL)
118 ALU2(MACH)
119 ALU2(AND)
120 ALU2(OR)
121 ALU2(XOR)
122 ALU2(DP3)
123 ALU2(DP4)
124 ALU2(DPH)
125 ALU2(SHL)
126 ALU2(SHR)
127 ALU2(ASR)
128
129 /** Gen4 predicated IF. */
130 vec4_instruction *
131 vec4_visitor::IF(uint32_t predicate)
132 {
133 vec4_instruction *inst;
134
135 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
136 inst->predicate = predicate;
137
138 return inst;
139 }
140
141 /** Gen6+ IF with embedded comparison. */
142 vec4_instruction *
143 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
144 {
145 assert(intel->gen >= 6);
146
147 vec4_instruction *inst;
148
149 resolve_ud_negate(&src0);
150 resolve_ud_negate(&src1);
151
152 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
153 src0, src1);
154 inst->conditional_mod = condition;
155
156 return inst;
157 }
158
159 /**
160 * CMP: Sets the low bit of the destination channels with the result
161 * of the comparison, while the upper bits are undefined, and updates
162 * the flag register with the packed 16 bits of the result.
163 */
164 vec4_instruction *
165 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
166 {
167 vec4_instruction *inst;
168
169 /* original gen4 does type conversion to the destination type
170          * before comparison, producing garbage results for floating
171 * point comparisons.
172 */
173 if (intel->gen == 4) {
174 dst.type = src0.type;
175 if (dst.file == HW_REG)
176 dst.fixed_hw_reg.type = dst.type;
177 }
178
179 resolve_ud_negate(&src0);
180 resolve_ud_negate(&src1);
181
182 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
183 inst->conditional_mod = condition;
184
185 return inst;
186 }
187
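/* Scratch reads and writes are send messages (data port accesses), so
 * their payload has to be staged in message registers: base_mrf is the
 * first MRF of the payload and mlen its length in registers, counting
 * the message header.
 */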
188 vec4_instruction *
189 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
194 dst, index);
195 inst->base_mrf = 14;
196 inst->mlen = 2;
197
198 return inst;
199 }
200
201 vec4_instruction *
202 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
203 {
204 vec4_instruction *inst;
205
206 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
207 dst, src, index);
208 inst->base_mrf = 13;
209 inst->mlen = 3;
210
211 return inst;
212 }
213
214 void
215 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
216 {
217 static enum opcode dot_opcodes[] = {
218 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
219 };
220
221 emit(dot_opcodes[elements - 2], dst, src0, src1);
222 }
223
224 src_reg
225 vec4_visitor::fix_math_operand(src_reg src)
226 {
227 /* The gen6 math instruction ignores the source modifiers --
228 * swizzle, abs, negate, and at least some parts of the register
229 * region description.
230 *
231 * Rather than trying to enumerate all these cases, *always* expand the
232 * operand to a temp GRF for gen6.
233 *
234 * For gen7, keep the operand as-is, except if immediate, which gen7 still
235 * can't use.
236 */
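   /* For example, a negated or swizzled operand (say, -val or val.zzzz)
    * would otherwise reach the gen6 math unit with those modifiers
    * silently ignored; copying it through a plain vec4 temporary makes
    * the MOV apply them first.
    */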
237
238 if (intel->gen == 7 && src.file != IMM)
239 return src;
240
241 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
242 expanded.type = src.type;
243 emit(MOV(expanded, src));
244 return src_reg(expanded);
245 }
246
247 void
248 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
249 {
250 src = fix_math_operand(src);
251
252 if (dst.writemask != WRITEMASK_XYZW) {
253 /* The gen6 math instruction must be align1, so we can't do
254 * writemasks.
255 */
256 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
257
258 emit(opcode, temp_dst, src);
259
260 emit(MOV(dst, src_reg(temp_dst)));
261 } else {
262 emit(opcode, dst, src);
263 }
264 }
265
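/* On gen4/5, math is a message to the shared math unit rather than an
 * ordinary instruction, so the operand is passed through message
 * registers: base_mrf names the first MRF and mlen the message length.
 */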
266 void
267 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
268 {
269 vec4_instruction *inst = emit(opcode, dst, src);
270 inst->base_mrf = 1;
271 inst->mlen = 1;
272 }
273
274 void
275 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
276 {
277 switch (opcode) {
278 case SHADER_OPCODE_RCP:
279 case SHADER_OPCODE_RSQ:
280 case SHADER_OPCODE_SQRT:
281 case SHADER_OPCODE_EXP2:
282 case SHADER_OPCODE_LOG2:
283 case SHADER_OPCODE_SIN:
284 case SHADER_OPCODE_COS:
285 break;
286 default:
287 assert(!"not reached: bad math opcode");
288 return;
289 }
290
291 if (intel->gen >= 6) {
292 return emit_math1_gen6(opcode, dst, src);
293 } else {
294 return emit_math1_gen4(opcode, dst, src);
295 }
296 }
297
298 void
299 vec4_visitor::emit_math2_gen6(enum opcode opcode,
300 dst_reg dst, src_reg src0, src_reg src1)
301 {
302 src0 = fix_math_operand(src0);
303 src1 = fix_math_operand(src1);
304
305 if (dst.writemask != WRITEMASK_XYZW) {
306 /* The gen6 math instruction must be align1, so we can't do
307 * writemasks.
308 */
309 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
310 temp_dst.type = dst.type;
311
312 emit(opcode, temp_dst, src0, src1);
313
314 emit(MOV(dst, src_reg(temp_dst)));
315 } else {
316 emit(opcode, dst, src0, src1);
317 }
318 }
319
320 void
321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
322 dst_reg dst, src_reg src0, src_reg src1)
323 {
324 vec4_instruction *inst = emit(opcode, dst, src0, src1);
325 inst->base_mrf = 1;
326 inst->mlen = 2;
327 }
328
329 void
330 vec4_visitor::emit_math(enum opcode opcode,
331 dst_reg dst, src_reg src0, src_reg src1)
332 {
333 switch (opcode) {
334 case SHADER_OPCODE_POW:
335 case SHADER_OPCODE_INT_QUOTIENT:
336 case SHADER_OPCODE_INT_REMAINDER:
337 break;
338 default:
339 assert(!"not reached: unsupported binary math opcode");
340 return;
341 }
342
343 if (intel->gen >= 6) {
344 return emit_math2_gen6(opcode, dst, src0, src1);
345 } else {
346 return emit_math2_gen4(opcode, dst, src0, src1);
347 }
348 }
349
350 void
351 vec4_visitor::visit_instructions(const exec_list *list)
352 {
353 foreach_list(node, list) {
354 ir_instruction *ir = (ir_instruction *)node;
355
356 base_ir = ir;
357 ir->accept(this);
358 }
359 }
360
361
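/* Returns the size of a type in vec4 (128-bit) register slots: scalars
 * and vectors take one slot each, matrices one per column, and arrays
 * and structs the sum of their elements.
 */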
362 static int
363 type_size(const struct glsl_type *type)
364 {
365 unsigned int i;
366 int size;
367
368 switch (type->base_type) {
369 case GLSL_TYPE_UINT:
370 case GLSL_TYPE_INT:
371 case GLSL_TYPE_FLOAT:
372 case GLSL_TYPE_BOOL:
373 if (type->is_matrix()) {
374 return type->matrix_columns;
375 } else {
376          /* Regardless of the size of the vector, it gets a vec4. This is bad
377 * packing for things like floats, but otherwise arrays become a
378 * mess. Hopefully a later pass over the code can pack scalars
379 * down if appropriate.
380 */
381 return 1;
382 }
383 case GLSL_TYPE_ARRAY:
384 assert(type->length > 0);
385 return type_size(type->fields.array) * type->length;
386 case GLSL_TYPE_STRUCT:
387 size = 0;
388 for (i = 0; i < type->length; i++) {
389 size += type_size(type->fields.structure[i].type);
390 }
391 return size;
392 case GLSL_TYPE_SAMPLER:
393 /* Samplers take up one slot in UNIFORMS[], but they're baked in
394 * at link time.
395 */
396 return 1;
397 default:
398 assert(0);
399 return 0;
400 }
401 }
402
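/* Allocate a new virtual GRF of the given size (in vec4 slots) and
 * return its index.  The bookkeeping arrays grow geometrically (16, 32,
 * 64, ...) so repeated allocations stay cheap.
 */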
403 int
404 vec4_visitor::virtual_grf_alloc(int size)
405 {
406 if (virtual_grf_array_size <= virtual_grf_count) {
407 if (virtual_grf_array_size == 0)
408 virtual_grf_array_size = 16;
409 else
410 virtual_grf_array_size *= 2;
411 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
412 virtual_grf_array_size);
413 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
414 virtual_grf_array_size);
415 }
416 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
417 virtual_grf_reg_count += size;
418 virtual_grf_sizes[virtual_grf_count] = size;
419 return virtual_grf_count++;
420 }
421
422 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
423 {
424 init();
425
426 this->file = GRF;
427 this->reg = v->virtual_grf_alloc(type_size(type));
428
429 if (type->is_array() || type->is_record()) {
430 this->swizzle = BRW_SWIZZLE_NOOP;
431 } else {
432 this->swizzle = swizzle_for_size(type->vector_elements);
433 }
434
435 this->type = brw_type_for_base_type(type);
436 }
437
438 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
439 {
440 init();
441
442 this->file = GRF;
443 this->reg = v->virtual_grf_alloc(type_size(type));
444
445 if (type->is_array() || type->is_record()) {
446 this->writemask = WRITEMASK_XYZW;
447 } else {
448 this->writemask = (1 << type->vector_elements) - 1;
449 }
450
451 this->type = brw_type_for_base_type(type);
452 }
453
454 /* Our support for uniforms is piggy-backed on the struct
455 * gl_fragment_program, because that's where the values actually
456 * get stored, rather than in some global gl_shader_program uniform
457 * store.
458 */
459 void
460 vec4_visitor::setup_uniform_values(ir_variable *ir)
461 {
462 int namelen = strlen(ir->name);
463
464 /* The data for our (non-builtin) uniforms is stored in a series of
465 * gl_uniform_driver_storage structs for each subcomponent that
466 * glGetUniformLocation() could name. We know it's been set up in the same
467 * order we'd walk the type, so walk the list of storage and find anything
468 * with our name, or the prefix of a component that starts with our name.
469 */
470 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
471 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
472
473 if (strncmp(ir->name, storage->name, namelen) != 0 ||
474 (storage->name[namelen] != 0 &&
475 storage->name[namelen] != '.' &&
476 storage->name[namelen] != '[')) {
477 continue;
478 }
479
480 gl_constant_value *components = storage->storage;
481 unsigned vector_count = (MAX2(storage->array_elements, 1) *
482 storage->type->matrix_columns);
483
484 for (unsigned s = 0; s < vector_count; s++) {
485 uniform_vector_size[uniforms] = storage->type->vector_elements;
486
487 int i;
488 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
489 c->prog_data.param[uniforms * 4 + i] = &components->f;
490 components++;
491 }
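         /* Pad the remaining components of this slot with a pointer to a
          * zero constant, so every uniform always contributes four param
          * entries.
          */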
492 for (; i < 4; i++) {
493 static float zero = 0;
494 c->prog_data.param[uniforms * 4 + i] = &zero;
495 }
496
497 uniforms++;
498 }
499 }
500 }
501
502 void
503 vec4_visitor::setup_uniform_clipplane_values()
504 {
505 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
506
507 if (intel->gen < 6) {
508 /* Pre-Gen6, we compact clip planes. For example, if the user
509 * enables just clip planes 0, 1, and 3, we will enable clip planes
510 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
511 * plane 2. This simplifies the implementation of the Gen6 clip
512 * thread.
513 */
514 int compacted_clipplane_index = 0;
515 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
516 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
517 continue;
518
519 this->uniform_vector_size[this->uniforms] = 4;
520 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
521 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
522 for (int j = 0; j < 4; ++j) {
523 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
524 }
525 ++compacted_clipplane_index;
526 ++this->uniforms;
527 }
528 } else {
529 /* In Gen6 and later, we don't compact clip planes, because this
530 * simplifies the implementation of gl_ClipDistance.
531 */
532 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
533 this->uniform_vector_size[this->uniforms] = 4;
534 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
535 this->userplane[i].type = BRW_REGISTER_TYPE_F;
536 for (int j = 0; j < 4; ++j) {
537 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
538 }
539 ++this->uniforms;
540 }
541 }
542 }
543
544 /* Our support for builtin uniforms is even scarier than non-builtin.
545 * It sits on top of the PROG_STATE_VAR parameters that are
546 * automatically updated from GL context state.
547 */
548 void
549 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
550 {
551 const ir_state_slot *const slots = ir->state_slots;
552 assert(ir->state_slots != NULL);
553
554 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
555 /* This state reference has already been setup by ir_to_mesa,
556 * but we'll get the same index back here. We can reference
557 * ParameterValues directly, since unlike brw_fs.cpp, we never
558 * add new state references during compile.
559 */
560 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
561 (gl_state_index *)slots[i].tokens);
562 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
563
564 this->uniform_vector_size[this->uniforms] = 0;
565 /* Add each of the unique swizzled channels of the element.
566 * This will end up matching the size of the glsl_type of this field.
567 */
568 int last_swiz = -1;
569 for (unsigned int j = 0; j < 4; j++) {
570 int swiz = GET_SWZ(slots[i].swizzle, j);
571 last_swiz = swiz;
572
573 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
574 if (swiz <= last_swiz)
575 this->uniform_vector_size[this->uniforms]++;
576 }
577 this->uniforms++;
578 }
579 }
580
581 dst_reg *
582 vec4_visitor::variable_storage(ir_variable *var)
583 {
584 return (dst_reg *)hash_table_find(this->variable_ht, var);
585 }
586
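/* Set the flag register from a boolean rvalue and report which predicate
 * a following IF or SEL should use.  The ALIGN16_ALL4H / ALIGN16_ANY4H
 * predicates test all four (or any) channels of a vec4 comparison, which
 * is how all_equal, any_nequal and any() collapse a vector condition to
 * a scalar one.
 */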
587 void
588 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
589 {
590 ir_expression *expr = ir->as_expression();
591
592 *predicate = BRW_PREDICATE_NORMAL;
593
594 if (expr) {
595 src_reg op[2];
596 vec4_instruction *inst;
597
598 assert(expr->get_num_operands() <= 2);
599 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
600 expr->operands[i]->accept(this);
601 op[i] = this->result;
602
603 resolve_ud_negate(&op[i]);
604 }
605
606 switch (expr->operation) {
607 case ir_unop_logic_not:
608 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
609 inst->conditional_mod = BRW_CONDITIONAL_Z;
610 break;
611
612 case ir_binop_logic_xor:
613 inst = emit(XOR(dst_null_d(), op[0], op[1]));
614 inst->conditional_mod = BRW_CONDITIONAL_NZ;
615 break;
616
617 case ir_binop_logic_or:
618 inst = emit(OR(dst_null_d(), op[0], op[1]));
619 inst->conditional_mod = BRW_CONDITIONAL_NZ;
620 break;
621
622 case ir_binop_logic_and:
623 inst = emit(AND(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_unop_f2b:
628 if (intel->gen >= 6) {
629 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
630 } else {
631 inst = emit(MOV(dst_null_f(), op[0]));
632 inst->conditional_mod = BRW_CONDITIONAL_NZ;
633 }
634 break;
635
636 case ir_unop_i2b:
637 if (intel->gen >= 6) {
638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
639 } else {
640 inst = emit(MOV(dst_null_d(), op[0]));
641 inst->conditional_mod = BRW_CONDITIONAL_NZ;
642 }
643 break;
644
645 case ir_binop_all_equal:
646 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
647 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
648 break;
649
650 case ir_binop_any_nequal:
651 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
652 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
653 break;
654
655 case ir_unop_any:
656 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
657 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
658 break;
659
660 case ir_binop_greater:
661 case ir_binop_gequal:
662 case ir_binop_less:
663 case ir_binop_lequal:
664 case ir_binop_equal:
665 case ir_binop_nequal:
666 emit(CMP(dst_null_d(), op[0], op[1],
667 brw_conditional_for_comparison(expr->operation)));
668 break;
669
670 default:
671 assert(!"not reached");
672 break;
673 }
674 return;
675 }
676
677 ir->accept(this);
678
679 resolve_ud_negate(&this->result);
680
681 if (intel->gen >= 6) {
682 vec4_instruction *inst = emit(AND(dst_null_d(),
683 this->result, src_reg(1)));
684 inst->conditional_mod = BRW_CONDITIONAL_NZ;
685 } else {
686 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
687 inst->conditional_mod = BRW_CONDITIONAL_NZ;
688 }
689 }
690
691 /**
692 * Emit a gen6 IF statement with the comparison folded into the IF
693 * instruction.
694 */
695 void
696 vec4_visitor::emit_if_gen6(ir_if *ir)
697 {
698 ir_expression *expr = ir->condition->as_expression();
699
700 if (expr) {
701 src_reg op[2];
702 dst_reg temp;
703
704 assert(expr->get_num_operands() <= 2);
705 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
706 expr->operands[i]->accept(this);
707 op[i] = this->result;
708 }
709
710 switch (expr->operation) {
711 case ir_unop_logic_not:
712 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
713 return;
714
715 case ir_binop_logic_xor:
716 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
717 return;
718
719 case ir_binop_logic_or:
720 temp = dst_reg(this, glsl_type::bool_type);
721 emit(OR(temp, op[0], op[1]));
722 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
723 return;
724
725 case ir_binop_logic_and:
726 temp = dst_reg(this, glsl_type::bool_type);
727 emit(AND(temp, op[0], op[1]));
728 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
729 return;
730
731 case ir_unop_f2b:
732 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_unop_i2b:
736 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
737 return;
738
739 case ir_binop_greater:
740 case ir_binop_gequal:
741 case ir_binop_less:
742 case ir_binop_lequal:
743 case ir_binop_equal:
744 case ir_binop_nequal:
745 emit(IF(op[0], op[1],
746 brw_conditional_for_comparison(expr->operation)));
747 return;
748
749 case ir_binop_all_equal:
750 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
751 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
752 return;
753
754 case ir_binop_any_nequal:
755 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
756 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
757 return;
758
759 case ir_unop_any:
760 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
761 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
762 return;
763
764 default:
765 assert(!"not reached");
766 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
767 return;
768 }
769 return;
770 }
771
772 ir->condition->accept(this);
773
774 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
775 }
776
777 static dst_reg
778 with_writemask(dst_reg const & r, int mask)
779 {
780 dst_reg result = r;
781 result.writemask = mask;
782 return result;
783 }
784
785 void
786 vec4_visitor::emit_attribute_fixups()
787 {
788 dst_reg sign_recovery_shift;
789 dst_reg normalize_factor;
790 dst_reg es3_normalize_factor;
791
792 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
793 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
794 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
795 dst_reg reg(ATTR, i);
796 dst_reg reg_d = reg;
797 reg_d.type = BRW_REGISTER_TYPE_D;
798 dst_reg reg_ud = reg;
799 reg_ud.type = BRW_REGISTER_TYPE_UD;
800
801 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
802 * come in as floating point conversions of the integer values.
803 */
804 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
805 dst_reg dst = reg;
806 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
807 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
808 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
809 }
810
811 /* Do sign recovery for 2101010 formats if required. */
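            /* Each 10-bit component (2-bit for W) is shifted up so its
             * sign bit lands in bit 31 and then arithmetically shifted
             * back down, which sign-extends it: hence the <22,22,22,30>
             * shift counts (32-10 and 32-2).
             */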
812 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
813 if (sign_recovery_shift.file == BAD_FILE) {
814 /* shift constant: <22,22,22,30> */
815 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
816 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
817 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
818 }
819
820 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
821 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
822 }
823
824 /* Apply BGRA swizzle if required. */
825 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
826 src_reg temp = src_reg(reg);
827 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
828 emit(MOV(reg, temp));
829 }
830
831 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
832 /* ES 3.0 has different rules for converting signed normalized
833 * fixed-point numbers than desktop GL.
834 */
835 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
836 /* According to equation 2.2 of the ES 3.0 specification,
837 * signed normalization conversion is done by:
838 *
839 * f = c / (2^(b-1)-1)
840 */
841 if (es3_normalize_factor.file == BAD_FILE) {
842 /* mul constant: 1 / (2^(b-1) - 1) */
843 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
844 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
845 src_reg(1.0f / ((1<<9) - 1))));
846 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
847 src_reg(1.0f / ((1<<1) - 1))));
848 }
849
850 dst_reg dst = reg;
851 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
852 emit(MOV(dst, src_reg(reg_d)));
853 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
854 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
855 } else {
856 /* The following equations are from the OpenGL 3.2 specification:
857 *
858 * 2.1 unsigned normalization
859 * f = c/(2^n-1)
860 *
861 * 2.2 signed normalization
862 * f = (2c+1)/(2^n-1)
863 *
864 * Both of these share a common divisor, which is represented by
865 * "normalize_factor" in the code below.
866 */
867 if (normalize_factor.file == BAD_FILE) {
868 /* 1 / (2^b - 1) for b=<10,10,10,2> */
869 normalize_factor = dst_reg(this, glsl_type::vec4_type);
870 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
871 src_reg(1.0f / ((1<<10) - 1))));
872 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
873 src_reg(1.0f / ((1<<2) - 1))));
874 }
875
876 dst_reg dst = reg;
877 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
878 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
879
880 /* For signed normalization, we want the numerator to be 2c+1. */
881 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
882 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
883 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
884 }
885
886 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
887 }
888 }
889
890 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
891 dst_reg dst = reg;
892 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
893 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
894 }
895 }
896 }
897 }
898
899 void
900 vec4_visitor::visit(ir_variable *ir)
901 {
902 dst_reg *reg = NULL;
903
904 if (variable_storage(ir))
905 return;
906
907 switch (ir->mode) {
908 case ir_var_in:
909 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
910 break;
911
912 case ir_var_out:
913 reg = new(mem_ctx) dst_reg(this, ir->type);
914
915 for (int i = 0; i < type_size(ir->type); i++) {
916 output_reg[ir->location + i] = *reg;
917 output_reg[ir->location + i].reg_offset = i;
918 output_reg[ir->location + i].type =
919 brw_type_for_base_type(ir->type->get_scalar_type());
920 output_reg_annotation[ir->location + i] = ir->name;
921 }
922 break;
923
924 case ir_var_auto:
925 case ir_var_temporary:
926 reg = new(mem_ctx) dst_reg(this, ir->type);
927 break;
928
929 case ir_var_uniform:
930 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
931
932 /* Thanks to the lower_ubo_reference pass, we will see only
933 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
934 * variables, so no need for them to be in variable_ht.
935 */
936 if (ir->uniform_block != -1)
937 return;
938
939 /* Track how big the whole uniform variable is, in case we need to put a
940 * copy of its data into pull constants for array access.
941 */
942 this->uniform_size[this->uniforms] = type_size(ir->type);
943
944 if (!strncmp(ir->name, "gl_", 3)) {
945 setup_builtin_uniform_values(ir);
946 } else {
947 setup_uniform_values(ir);
948 }
949 break;
950
951 case ir_var_system_value:
952 /* VertexID is stored by the VF as the last vertex element, but
953 * we don't represent it with a flag in inputs_read, so we call
954 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
955 */
956 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
957 prog_data->uses_vertexid = true;
958
959 switch (ir->location) {
960 case SYSTEM_VALUE_VERTEX_ID:
961 reg->writemask = WRITEMASK_X;
962 break;
963 case SYSTEM_VALUE_INSTANCE_ID:
964 reg->writemask = WRITEMASK_Y;
965 break;
966 default:
967 assert(!"not reached");
968 break;
969 }
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 void
1086 vec4_visitor::emit_bool_comparison(unsigned int op,
1087 dst_reg dst, src_reg src0, src_reg src1)
1088 {
1089 /* original gen4 does destination conversion before comparison. */
1090 if (intel->gen < 5)
1091 dst.type = src0.type;
1092
1093 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1094
1095 dst.type = BRW_REGISTER_TYPE_D;
1096 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1097 }
1098
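/* MIN/MAX: gen6+ can fold the comparison into SEL via a conditional
 * modifier; gen4/5 need an explicit CMP to set the flag register and a
 * predicated SEL afterwards.
 */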
1099 void
1100 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1101 src_reg src0, src_reg src1)
1102 {
1103 vec4_instruction *inst;
1104
1105 if (intel->gen >= 6) {
1106 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1107 inst->conditional_mod = conditionalmod;
1108 } else {
1109 emit(CMP(dst, src0, src1, conditionalmod));
1110
1111 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1112 inst->predicate = BRW_PREDICATE_NORMAL;
1113 }
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_expression *ir)
1118 {
1119 unsigned int operand;
1120 src_reg op[Elements(ir->operands)];
1121 src_reg result_src;
1122 dst_reg result_dst;
1123 vec4_instruction *inst;
1124
1125 if (try_emit_sat(ir))
1126 return;
1127
1128 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1129 this->result.file = BAD_FILE;
1130 ir->operands[operand]->accept(this);
1131 if (this->result.file == BAD_FILE) {
1132 printf("Failed to get tree for expression operand:\n");
1133 ir->operands[operand]->print();
1134 exit(1);
1135 }
1136 op[operand] = this->result;
1137
1138 /* Matrix expression operands should have been broken down to vector
1139 * operations already.
1140 */
1141 assert(!ir->operands[operand]->type->is_matrix());
1142 }
1143
1144 int vector_elements = ir->operands[0]->type->vector_elements;
1145 if (ir->operands[1]) {
1146 vector_elements = MAX2(vector_elements,
1147 ir->operands[1]->type->vector_elements);
1148 }
1149
1150 this->result.file = BAD_FILE;
1151
1152 /* Storage for our result. Ideally for an assignment we'd be using
1153 * the actual storage for the result here, instead.
1154 */
1155 result_src = src_reg(this, ir->type);
1156 /* convenience for the emit functions below. */
1157 result_dst = dst_reg(result_src);
1158 /* If nothing special happens, this is the result. */
1159 this->result = result_src;
1160 /* Limit writes to the channels that will be used by result_src later.
1161 * This does limit this temp's use as a temporary for multi-instruction
1162 * sequences.
1163 */
1164 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1165
1166 switch (ir->operation) {
1167 case ir_unop_logic_not:
1168 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1169     * the ones' complement of the whole register, not just bit 0.
1170 */
1171 emit(XOR(result_dst, op[0], src_reg(1)));
1172 break;
1173 case ir_unop_neg:
1174 op[0].negate = !op[0].negate;
1175 this->result = op[0];
1176 break;
1177 case ir_unop_abs:
1178 op[0].abs = true;
1179 op[0].negate = false;
1180 this->result = op[0];
1181 break;
1182
1183 case ir_unop_sign:
1184 emit(MOV(result_dst, src_reg(0.0f)));
1185
1186 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1187 inst = emit(MOV(result_dst, src_reg(1.0f)));
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189
1190 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1191 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1192 inst->predicate = BRW_PREDICATE_NORMAL;
1193
1194 break;
1195
1196 case ir_unop_rcp:
1197 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1198 break;
1199
1200 case ir_unop_exp2:
1201 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1202 break;
1203 case ir_unop_log2:
1204 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1205 break;
1206 case ir_unop_exp:
1207 case ir_unop_log:
1208 assert(!"not reached: should be handled by ir_explog_to_explog2");
1209 break;
1210 case ir_unop_sin:
1211 case ir_unop_sin_reduced:
1212 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1213 break;
1214 case ir_unop_cos:
1215 case ir_unop_cos_reduced:
1216 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1217 break;
1218
1219 case ir_unop_dFdx:
1220 case ir_unop_dFdy:
1221 assert(!"derivatives not valid in vertex shader");
1222 break;
1223
1224 case ir_unop_noise:
1225 assert(!"not reached: should be handled by lower_noise");
1226 break;
1227
1228 case ir_binop_add:
1229 emit(ADD(result_dst, op[0], op[1]));
1230 break;
1231 case ir_binop_sub:
1232 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1233 break;
1234
1235 case ir_binop_mul:
1236 if (ir->type->is_integer()) {
1237 /* For integer multiplication, the MUL uses the low 16 bits
1238 * of one of the operands (src0 on gen6, src1 on gen7). The
1239 * MACH accumulates in the contribution of the upper 16 bits
1240 * of that operand.
1241 *
1242 * FINISHME: Emit just the MUL if we know an operand is small
1243 * enough.
1244 */
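         /* After the MUL/MACH pair the accumulator holds the low 32 bits
          * of the full product (MACH's own high-half result is thrown
          * away via the null destination), so the final MOV copies the
          * product out of the accumulator.
          */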
1245 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1246
1247 emit(MUL(acc, op[0], op[1]));
1248 emit(MACH(dst_null_d(), op[0], op[1]));
1249 emit(MOV(result_dst, src_reg(acc)));
1250 } else {
1251 emit(MUL(result_dst, op[0], op[1]));
1252 }
1253 break;
1254 case ir_binop_div:
1255 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1256 assert(ir->type->is_integer());
1257 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1258 break;
1259 case ir_binop_mod:
1260 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1261 assert(ir->type->is_integer());
1262 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1263 break;
1264
1265 case ir_binop_less:
1266 case ir_binop_greater:
1267 case ir_binop_lequal:
1268 case ir_binop_gequal:
1269 case ir_binop_equal:
1270 case ir_binop_nequal: {
1271 emit(CMP(result_dst, op[0], op[1],
1272 brw_conditional_for_comparison(ir->operation)));
1273 emit(AND(result_dst, result_src, src_reg(0x1)));
1274 break;
1275 }
1276
1277 case ir_binop_all_equal:
1278 /* "==" operator producing a scalar boolean. */
1279 if (ir->operands[0]->type->is_vector() ||
1280 ir->operands[1]->type->is_vector()) {
1281 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1282 emit(MOV(result_dst, src_reg(0)));
1283 inst = emit(MOV(result_dst, src_reg(1)));
1284 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1285 } else {
1286 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1287 emit(AND(result_dst, result_src, src_reg(0x1)));
1288 }
1289 break;
1290 case ir_binop_any_nequal:
1291 /* "!=" operator producing a scalar boolean. */
1292 if (ir->operands[0]->type->is_vector() ||
1293 ir->operands[1]->type->is_vector()) {
1294 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1295
1296 emit(MOV(result_dst, src_reg(0)));
1297 inst = emit(MOV(result_dst, src_reg(1)));
1298 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1299 } else {
1300 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1301 emit(AND(result_dst, result_src, src_reg(0x1)));
1302 }
1303 break;
1304
1305 case ir_unop_any:
1306 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1307 emit(MOV(result_dst, src_reg(0)));
1308
1309 inst = emit(MOV(result_dst, src_reg(1)));
1310 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1311 break;
1312
1313 case ir_binop_logic_xor:
1314 emit(XOR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_logic_or:
1318 emit(OR(result_dst, op[0], op[1]));
1319 break;
1320
1321 case ir_binop_logic_and:
1322 emit(AND(result_dst, op[0], op[1]));
1323 break;
1324
1325 case ir_binop_dot:
1326 assert(ir->operands[0]->type->is_vector());
1327 assert(ir->operands[0]->type == ir->operands[1]->type);
1328 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1329 break;
1330
1331 case ir_unop_sqrt:
1332 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1333 break;
1334 case ir_unop_rsq:
1335 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_bitcast_i2f:
1339 case ir_unop_bitcast_u2f:
1340 this->result = op[0];
1341 this->result.type = BRW_REGISTER_TYPE_F;
1342 break;
1343
1344 case ir_unop_bitcast_f2i:
1345 this->result = op[0];
1346 this->result.type = BRW_REGISTER_TYPE_D;
1347 break;
1348
1349 case ir_unop_bitcast_f2u:
1350 this->result = op[0];
1351 this->result.type = BRW_REGISTER_TYPE_UD;
1352 break;
1353
1354 case ir_unop_i2f:
1355 case ir_unop_i2u:
1356 case ir_unop_u2i:
1357 case ir_unop_u2f:
1358 case ir_unop_b2f:
1359 case ir_unop_b2i:
1360 case ir_unop_f2i:
1361 case ir_unop_f2u:
1362 emit(MOV(result_dst, op[0]));
1363 break;
1364 case ir_unop_f2b:
1365 case ir_unop_i2b: {
1366 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1367 emit(AND(result_dst, result_src, src_reg(1)));
1368 break;
1369 }
1370
1371 case ir_unop_trunc:
1372 emit(RNDZ(result_dst, op[0]));
1373 break;
1374 case ir_unop_ceil:
1375 op[0].negate = !op[0].negate;
1376 inst = emit(RNDD(result_dst, op[0]));
1377 this->result.negate = true;
1378 break;
1379 case ir_unop_floor:
1380 inst = emit(RNDD(result_dst, op[0]));
1381 break;
1382 case ir_unop_fract:
1383 inst = emit(FRC(result_dst, op[0]));
1384 break;
1385 case ir_unop_round_even:
1386 emit(RNDE(result_dst, op[0]));
1387 break;
1388
1389 case ir_binop_min:
1390 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1391 break;
1392 case ir_binop_max:
1393 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1394 break;
1395
1396 case ir_binop_pow:
1397 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1398 break;
1399
1400 case ir_unop_bit_not:
1401 inst = emit(NOT(result_dst, op[0]));
1402 break;
1403 case ir_binop_bit_and:
1404 inst = emit(AND(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_bit_xor:
1407 inst = emit(XOR(result_dst, op[0], op[1]));
1408 break;
1409 case ir_binop_bit_or:
1410 inst = emit(OR(result_dst, op[0], op[1]));
1411 break;
1412
1413 case ir_binop_lshift:
1414 inst = emit(SHL(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_rshift:
1418 if (ir->type->base_type == GLSL_TYPE_INT)
1419 inst = emit(ASR(result_dst, op[0], op[1]));
1420 else
1421 inst = emit(SHR(result_dst, op[0], op[1]));
1422 break;
1423
1424 case ir_binop_ubo_load: {
1425 ir_constant *uniform_block = ir->operands[0]->as_constant();
1426 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1427 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1428 src_reg offset = op[1];
1429
1430 /* Now, load the vector from that offset. */
1431 assert(ir->type->is_vector() || ir->type->is_scalar());
1432
1433 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1434 packed_consts.type = result.type;
1435 src_reg surf_index =
1436 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1437 if (const_offset_ir) {
1438 offset = src_reg(const_offset / 16);
1439 } else {
1440 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1441 }
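      /* Each pull-constant load fetches an aligned 16-byte (vec4) slot,
       * so the byte offset is scaled down to a slot index above; the
       * swizzle fixup below then starts the read at channel
       * const_offset % 16 / 4 within that slot.
       */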
1442
1443 vec4_instruction *pull =
1444 emit(new(mem_ctx) vec4_instruction(this,
1445 VS_OPCODE_PULL_CONSTANT_LOAD,
1446 dst_reg(packed_consts),
1447 surf_index,
1448 offset));
1449 pull->base_mrf = 14;
1450 pull->mlen = 1;
1451
1452 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1453 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1454 const_offset % 16 / 4,
1455 const_offset % 16 / 4,
1456 const_offset % 16 / 4);
1457
1458 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1459 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1460 emit(CMP(result_dst, packed_consts, src_reg(0u),
1461 BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result, src_reg(0x1)));
1463 } else {
1464 emit(MOV(result_dst, packed_consts));
1465 }
1466 break;
1467 }
1468
1469 case ir_quadop_vector:
1470 assert(!"not reached: should be handled by lower_quadop_vector");
1471 break;
1472 }
1473 }
1474
1475
1476 void
1477 vec4_visitor::visit(ir_swizzle *ir)
1478 {
1479 src_reg src;
1480 int i = 0;
1481 int swizzle[4];
1482
1483 /* Note that this is only swizzles in expressions, not those on the left
1484 * hand side of an assignment, which do write masking. See ir_assignment
1485 * for that.
1486 */
1487
1488 ir->val->accept(this);
1489 src = this->result;
1490 assert(src.file != BAD_FILE);
1491
1492 for (i = 0; i < ir->type->vector_elements; i++) {
1493 switch (i) {
1494 case 0:
1495 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1496 break;
1497 case 1:
1498 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1499 break;
1500 case 2:
1501 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1502 break;
1503 case 3:
1504 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1505 break;
1506 }
1507 }
1508 for (; i < 4; i++) {
1509 /* Replicate the last channel out. */
1510 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1511 }
1512
1513 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1514
1515 this->result = src;
1516 }
1517
1518 void
1519 vec4_visitor::visit(ir_dereference_variable *ir)
1520 {
1521 const struct glsl_type *type = ir->type;
1522 dst_reg *reg = variable_storage(ir->var);
1523
1524 if (!reg) {
1525 fail("Failed to find variable storage for %s\n", ir->var->name);
1526 this->result = src_reg(brw_null_reg());
1527 return;
1528 }
1529
1530 this->result = src_reg(*reg);
1531
1532 /* System values get their swizzle from the dst_reg writemask */
1533 if (ir->var->mode == ir_var_system_value)
1534 return;
1535
1536 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1537 this->result.swizzle = swizzle_for_size(type->vector_elements);
1538 }
1539
1540 void
1541 vec4_visitor::visit(ir_dereference_array *ir)
1542 {
1543 ir_constant *constant_index;
1544 src_reg src;
1545 int element_size = type_size(ir->type);
1546
1547 constant_index = ir->array_index->constant_expression_value();
1548
1549 ir->array->accept(this);
1550 src = this->result;
1551
1552 if (constant_index) {
1553 src.reg_offset += constant_index->value.i[0] * element_size;
1554 } else {
1555 /* Variable index array dereference. It eats the "vec4" of the
1556 * base of the array and an index that offsets the Mesa register
1557 * index.
1558 */
1559 ir->array_index->accept(this);
1560
1561 src_reg index_reg;
1562
1563 if (element_size == 1) {
1564 index_reg = this->result;
1565 } else {
1566 index_reg = src_reg(this, glsl_type::int_type);
1567
1568 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1569 }
1570
1571 if (src.reladdr) {
1572 src_reg temp = src_reg(this, glsl_type::int_type);
1573
1574 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1575
1576 index_reg = temp;
1577 }
1578
1579 src.reladdr = ralloc(mem_ctx, src_reg);
1580 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1581 }
1582
1583 /* If the type is smaller than a vec4, replicate the last channel out. */
1584 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1585 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1586 else
1587 src.swizzle = BRW_SWIZZLE_NOOP;
1588 src.type = brw_type_for_base_type(ir->type);
1589
1590 this->result = src;
1591 }
1592
1593 void
1594 vec4_visitor::visit(ir_dereference_record *ir)
1595 {
1596 unsigned int i;
1597 const glsl_type *struct_type = ir->record->type;
1598 int offset = 0;
1599
1600 ir->record->accept(this);
1601
1602 for (i = 0; i < struct_type->length; i++) {
1603 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1604 break;
1605 offset += type_size(struct_type->fields.structure[i].type);
1606 }
1607
1608 /* If the type is smaller than a vec4, replicate the last channel out. */
1609 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1610 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1611 else
1612 this->result.swizzle = BRW_SWIZZLE_NOOP;
1613 this->result.type = brw_type_for_base_type(ir->type);
1614
1615 this->result.reg_offset += offset;
1616 }
1617
1618 /**
1619 * We want to be careful in assignment setup to hit the actual storage
1620 * instead of potentially using a temporary like we might with the
1621 * ir_dereference handler.
1622 */
1623 static dst_reg
1624 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1625 {
1626 /* The LHS must be a dereference. If the LHS is a variable indexed array
1627     * access of a vector, it must be separated into a series of conditional moves
1628 * before reaching this point (see ir_vec_index_to_cond_assign).
1629 */
1630 assert(ir->as_dereference());
1631 ir_dereference_array *deref_array = ir->as_dereference_array();
1632 if (deref_array) {
1633 assert(!deref_array->array->type->is_vector());
1634 }
1635
1636 /* Use the rvalue deref handler for the most part. We'll ignore
1637 * swizzles in it and write swizzles using writemask, though.
1638 */
1639 ir->accept(v);
1640 return dst_reg(v->result);
1641 }
1642
1643 void
1644 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1645 const struct glsl_type *type, uint32_t predicate)
1646 {
1647 if (type->base_type == GLSL_TYPE_STRUCT) {
1648 for (unsigned int i = 0; i < type->length; i++) {
1649 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1650 }
1651 return;
1652 }
1653
1654 if (type->is_array()) {
1655 for (unsigned int i = 0; i < type->length; i++) {
1656 emit_block_move(dst, src, type->fields.array, predicate);
1657 }
1658 return;
1659 }
1660
1661 if (type->is_matrix()) {
1662 const struct glsl_type *vec_type;
1663
1664 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1665 type->vector_elements, 1);
1666
1667 for (int i = 0; i < type->matrix_columns; i++) {
1668 emit_block_move(dst, src, vec_type, predicate);
1669 }
1670 return;
1671 }
1672
1673 assert(type->is_scalar() || type->is_vector());
1674
1675 dst->type = brw_type_for_base_type(type);
1676 src->type = dst->type;
1677
1678 dst->writemask = (1 << type->vector_elements) - 1;
1679
1680 src->swizzle = swizzle_for_size(type->vector_elements);
1681
1682 vec4_instruction *inst = emit(MOV(*dst, *src));
1683 inst->predicate = predicate;
1684
1685 dst->reg_offset++;
1686 src->reg_offset++;
1687 }
1688
1689
1690 /* If the RHS processing resulted in an instruction generating a
1691 * temporary value, and it would be easy to rewrite the instruction to
1692 * generate its result right into the LHS instead, do so. This ends
1693 * up reliably removing instructions where it can be tricky to do so
1694 * later without real UD chain information.
1695 */
1696 bool
1697 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1698 dst_reg dst,
1699 src_reg src,
1700 vec4_instruction *pre_rhs_inst,
1701 vec4_instruction *last_rhs_inst)
1702 {
1703 /* This could be supported, but it would take more smarts. */
1704 if (ir->condition)
1705 return false;
1706
1707 if (pre_rhs_inst == last_rhs_inst)
1708 return false; /* No instructions generated to work with. */
1709
1710 /* Make sure the last instruction generated our source reg. */
1711 if (src.file != GRF ||
1712 src.file != last_rhs_inst->dst.file ||
1713 src.reg != last_rhs_inst->dst.reg ||
1714 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1715 src.reladdr ||
1716 src.abs ||
1717 src.negate ||
1718 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1719 return false;
1720
1721    /* Check that the last instruction fully initialized the channels
1722 * we want to use, in the order we want to use them. We could
1723 * potentially reswizzle the operands of many instructions so that
1724 * we could handle out of order channels, but don't yet.
1725 */
1726
1727 for (unsigned i = 0; i < 4; i++) {
1728 if (dst.writemask & (1 << i)) {
1729 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1730 return false;
1731
1732 if (BRW_GET_SWZ(src.swizzle, i) != i)
1733 return false;
1734 }
1735 }
1736
1737 /* Success! Rewrite the instruction. */
1738 last_rhs_inst->dst.file = dst.file;
1739 last_rhs_inst->dst.reg = dst.reg;
1740 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1741 last_rhs_inst->dst.reladdr = dst.reladdr;
1742 last_rhs_inst->dst.writemask &= dst.writemask;
1743
1744 return true;
1745 }
1746
1747 void
1748 vec4_visitor::visit(ir_assignment *ir)
1749 {
1750 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1751 uint32_t predicate = BRW_PREDICATE_NONE;
1752
1753 if (!ir->lhs->type->is_scalar() &&
1754 !ir->lhs->type->is_vector()) {
1755 ir->rhs->accept(this);
1756 src_reg src = this->result;
1757
1758 if (ir->condition) {
1759 emit_bool_to_cond_code(ir->condition, &predicate);
1760 }
1761
1762 /* emit_block_move doesn't account for swizzles in the source register.
1763 * This should be ok, since the source register is a structure or an
1764 * array, and those can't be swizzled. But double-check to be sure.
1765 */
1766 assert(src.swizzle ==
1767 (ir->rhs->type->is_matrix()
1768 ? swizzle_for_size(ir->rhs->type->vector_elements)
1769 : BRW_SWIZZLE_NOOP));
1770
1771 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1772 return;
1773 }
1774
1775 /* Now we're down to just a scalar/vector with writemasks. */
1776 int i;
1777
1778 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1779 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1780
1781 ir->rhs->accept(this);
1782
1783 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1784
1785 src_reg src = this->result;
1786
1787 int swizzles[4];
1788 int first_enabled_chan = 0;
1789 int src_chan = 0;
1790
1791 assert(ir->lhs->type->is_vector() ||
1792 ir->lhs->type->is_scalar());
1793 dst.writemask = ir->write_mask;
1794
1795 for (int i = 0; i < 4; i++) {
1796 if (dst.writemask & (1 << i)) {
1797 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1798 break;
1799 }
1800 }
1801
1802 /* Swizzle a small RHS vector into the channels being written.
1803 *
1804 * glsl ir treats write_mask as dictating how many channels are
1805 * present on the RHS while in our instructions we need to make
1806 * those channels appear in the slots of the vec4 they're written to.
1807 */
1808 for (int i = 0; i < 4; i++) {
1809 if (dst.writemask & (1 << i))
1810 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1811 else
1812 swizzles[i] = first_enabled_chan;
1813 }
1814 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1815 swizzles[2], swizzles[3]);
1816
1817 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1818 return;
1819 }
1820
1821 if (ir->condition) {
1822 emit_bool_to_cond_code(ir->condition, &predicate);
1823 }
1824
1825 for (i = 0; i < type_size(ir->lhs->type); i++) {
1826 vec4_instruction *inst = emit(MOV(dst, src));
1827 inst->predicate = predicate;
1828
1829 dst.reg_offset++;
1830 src.reg_offset++;
1831 }
1832 }
1833
1834 void
1835 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1836 {
1837 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1838 foreach_list(node, &ir->components) {
1839 ir_constant *field_value = (ir_constant *)node;
1840
1841 emit_constant_values(dst, field_value);
1842 }
1843 return;
1844 }
1845
1846 if (ir->type->is_array()) {
1847 for (unsigned int i = 0; i < ir->type->length; i++) {
1848 emit_constant_values(dst, ir->array_elements[i]);
1849 }
1850 return;
1851 }
1852
1853 if (ir->type->is_matrix()) {
1854 for (int i = 0; i < ir->type->matrix_columns; i++) {
1855 float *vec = &ir->value.f[i * ir->type->vector_elements];
1856
1857 for (int j = 0; j < ir->type->vector_elements; j++) {
1858 dst->writemask = 1 << j;
1859 dst->type = BRW_REGISTER_TYPE_F;
1860
1861 emit(MOV(*dst, src_reg(vec[j])));
1862 }
1863 dst->reg_offset++;
1864 }
1865 return;
1866 }
1867
1868 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1869
1870 for (int i = 0; i < ir->type->vector_elements; i++) {
1871 if (!(remaining_writemask & (1 << i)))
1872 continue;
1873
1874 dst->writemask = 1 << i;
1875 dst->type = brw_type_for_base_type(ir->type);
1876
1877 /* Find other components that match the one we're about to
1878 * write. Emits fewer instructions for things like vec4(0.5,
1879 * 1.5, 1.5, 1.5).
1880 */
1881 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1882 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1883 if (ir->value.b[i] == ir->value.b[j])
1884 dst->writemask |= (1 << j);
1885 } else {
1886 /* u, i, and f storage all line up, so no need for a
1887 * switch case for comparing each type.
1888 */
1889 if (ir->value.u[i] == ir->value.u[j])
1890 dst->writemask |= (1 << j);
1891 }
1892 }
1893
1894 switch (ir->type->base_type) {
1895 case GLSL_TYPE_FLOAT:
1896 emit(MOV(*dst, src_reg(ir->value.f[i])));
1897 break;
1898 case GLSL_TYPE_INT:
1899 emit(MOV(*dst, src_reg(ir->value.i[i])));
1900 break;
1901 case GLSL_TYPE_UINT:
1902 emit(MOV(*dst, src_reg(ir->value.u[i])));
1903 break;
1904 case GLSL_TYPE_BOOL:
1905 emit(MOV(*dst, src_reg(ir->value.b[i])));
1906 break;
1907 default:
1908 assert(!"Non-float/uint/int/bool constant");
1909 break;
1910 }
1911
1912 remaining_writemask &= ~dst->writemask;
1913 }
1914 dst->reg_offset++;
1915 }
1916
1917 void
1918 vec4_visitor::visit(ir_constant *ir)
1919 {
1920 dst_reg dst = dst_reg(this, ir->type);
1921 this->result = src_reg(dst);
1922
1923 emit_constant_values(&dst, ir);
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_call *ir)
1928 {
1929 assert(!"not reached");
1930 }
1931
1932 void
1933 vec4_visitor::visit(ir_texture *ir)
1934 {
1935 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1936
1937 /* Should be lowered by do_lower_texture_projection */
1938 assert(!ir->projector);
1939
1940 /* Generate code to compute all the subexpression trees. This has to be
1941 * done before loading any values into MRFs for the sampler message since
1942 * generating these values may involve SEND messages that need the MRFs.
1943 */
1944 src_reg coordinate;
1945 if (ir->coordinate) {
1946 ir->coordinate->accept(this);
1947 coordinate = this->result;
1948 }
1949
1950 src_reg shadow_comparitor;
1951 if (ir->shadow_comparitor) {
1952 ir->shadow_comparitor->accept(this);
1953 shadow_comparitor = this->result;
1954 }
1955
1956 const glsl_type *lod_type;
1957 src_reg lod, dPdx, dPdy;
1958 switch (ir->op) {
1959 case ir_tex:
1960 lod = src_reg(0.0f);
1961 lod_type = glsl_type::float_type;
1962 break;
1963 case ir_txf:
1964 case ir_txl:
1965 case ir_txs:
1966 ir->lod_info.lod->accept(this);
1967 lod = this->result;
1968 lod_type = ir->lod_info.lod->type;
1969 break;
1970 case ir_txd:
1971 ir->lod_info.grad.dPdx->accept(this);
1972 dPdx = this->result;
1973
1974 ir->lod_info.grad.dPdy->accept(this);
1975 dPdy = this->result;
1976
1977 lod_type = ir->lod_info.grad.dPdx->type;
1978 break;
1979 case ir_txb:
1980 break;
1981 }
1982
1983 vec4_instruction *inst = NULL;
1984 switch (ir->op) {
1985 case ir_tex:
1986 case ir_txl:
1987 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1988 break;
1989 case ir_txd:
1990 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1991 break;
1992 case ir_txf:
1993 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1994 break;
1995 case ir_txs:
1996 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1997 break;
1998 case ir_txb:
1999 assert(!"TXB is not valid for vertex shaders.");
2000 }
2001
2002 /* Texel offsets go in the message header; Gen4 also requires headers. */
2003 inst->header_present = ir->offset || intel->gen < 5;
2004 inst->base_mrf = 2;
2005 inst->mlen = inst->header_present + 1; /* always at least one */
2006 inst->sampler = sampler;
2007 inst->dst = dst_reg(this, ir->type);
2008 inst->dst.writemask = WRITEMASK_XYZW;
2009 inst->shadow_compare = ir->shadow_comparitor != NULL;
2010
2011 if (ir->offset != NULL && ir->op != ir_txf)
2012 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2013
2014 /* MRF for the first parameter */
2015 int param_base = inst->base_mrf + inst->header_present;
2016
2017 if (ir->op == ir_txs) {
2018 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2019 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2020 } else {
2021 int i, coord_mask = 0, zero_mask = 0;
2022 /* Load the coordinate */
2023 /* FINISHME: gl_clamp_mask and saturate */
2024 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2025 coord_mask |= (1 << i);
2026 for (; i < 4; i++)
2027 zero_mask |= (1 << i);
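      /* For a two-component coordinate, for instance, coord_mask ends up as
       * WRITEMASK_XY and zero_mask as WRITEMASK_ZW, so the unused MRF
       * channels are zeroed below.
       */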
2028
2029 if (ir->offset && ir->op == ir_txf) {
2030 /* It appears that the ld instruction used for txf does its
2031 * address bounds check before adding in the offset. To work
2032 * around this, just add the integer offset to the integer
2033 * texel coordinate, and don't put the offset in the header.
2034 */
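         /* For example, a texelFetchOffset() call with a constant offset of
          * (1, 2) has 1 and 2 folded into the x and y texel coordinates by
          * the ADDs below, rather than being sent in the message header.
          */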
2035 ir_constant *offset = ir->offset->as_constant();
2036 assert(offset);
2037
2038 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2039 src_reg src = coordinate;
2040 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2041 BRW_GET_SWZ(src.swizzle, j),
2042 BRW_GET_SWZ(src.swizzle, j),
2043 BRW_GET_SWZ(src.swizzle, j));
2044 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2045 src, offset->value.i[j]));
2046 }
2047 } else {
2048 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2049 coordinate));
2050 }
2051 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2052 src_reg(0)));
2053      /* Load the shadow comparator */
2054 if (ir->shadow_comparitor) {
2055 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2056 WRITEMASK_X),
2057 shadow_comparitor));
2058 inst->mlen++;
2059 }
2060
2061 /* Load the LOD info */
2062 if (ir->op == ir_tex || ir->op == ir_txl) {
2063 int mrf, writemask;
2064 if (intel->gen >= 5) {
2065 mrf = param_base + 1;
2066 if (ir->shadow_comparitor) {
2067 writemask = WRITEMASK_Y;
2068 /* mlen already incremented */
2069 } else {
2070 writemask = WRITEMASK_X;
2071 inst->mlen++;
2072 }
2073 } else /* intel->gen == 4 */ {
2074 mrf = param_base;
2075 writemask = WRITEMASK_Z;
2076 }
2077 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2078 } else if (ir->op == ir_txf) {
2079 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2080 lod));
2081 } else if (ir->op == ir_txd) {
2082 const glsl_type *type = lod_type;
2083
2084 if (intel->gen >= 5) {
2085 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2086 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2087 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2088 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2089 inst->mlen++;
2090
2091 if (ir->type->vector_elements == 3) {
2092 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2093 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2094 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2095 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2096 inst->mlen++;
2097 }
2098 } else /* intel->gen == 4 */ {
2099 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2100 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2101 inst->mlen += 2;
2102 }
2103 }
2104 }
2105
2106 emit(inst);
2107
2108    /* Fix up the number of layers (the Z component) for cube map arrays:
2109     * the hardware returns faces * layers, but the spec requires just layers.
2110     */
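   /* For example, a cube map array with 4 layers reports 24 in Z; dividing
    * by 6 gives back the 4 layers the spec expects.
    */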
2111 if (ir->op == ir_txs) {
2112 glsl_type const *type = ir->sampler->type;
2113 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2114 type->sampler_array) {
2115 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2116 with_writemask(inst->dst, WRITEMASK_Z),
2117 src_reg(inst->dst), src_reg(6));
2118 }
2119 }
2120
2121 swizzle_result(ir, src_reg(inst->dst), sampler);
2122 }
2123
2124 void
2125 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2126 {
2127 int s = c->key.tex.swizzles[sampler];
2128
2129 this->result = src_reg(this, ir->type);
2130 dst_reg swizzled_result(this->result);
2131
2132 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2133 || s == SWIZZLE_NOOP) {
2134 emit(MOV(swizzled_result, orig_val));
2135 return;
2136 }
2137
2138 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2139 int swizzle[4];
2140
2141 for (int i = 0; i < 4; i++) {
2142 switch (GET_SWZ(s, i)) {
2143 case SWIZZLE_ZERO:
2144 zero_mask |= (1 << i);
2145 break;
2146 case SWIZZLE_ONE:
2147 one_mask |= (1 << i);
2148 break;
2149 default:
2150 copy_mask |= (1 << i);
2151 swizzle[i] = GET_SWZ(s, i);
2152 break;
2153 }
2154 }
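   /* As an illustration, a swizzle of (ZERO, ZERO, ZERO, W) yields
    * zero_mask == WRITEMASK_XYZ and copy_mask == WRITEMASK_W, so .xyz are
    * written with 0.0f and .w is copied from the sampler result.
    */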
2155
2156 if (copy_mask) {
2157 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2158 swizzled_result.writemask = copy_mask;
2159 emit(MOV(swizzled_result, orig_val));
2160 }
2161
2162 if (zero_mask) {
2163 swizzled_result.writemask = zero_mask;
2164 emit(MOV(swizzled_result, src_reg(0.0f)));
2165 }
2166
2167 if (one_mask) {
2168 swizzled_result.writemask = one_mask;
2169 emit(MOV(swizzled_result, src_reg(1.0f)));
2170 }
2171 }
2172
2173 void
2174 vec4_visitor::visit(ir_return *ir)
2175 {
2176 assert(!"not reached");
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_discard *ir)
2181 {
2182 assert(!"not reached");
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_if *ir)
2187 {
2188    /* Don't point the annotation at the if statement, because then the
2189     * condition plus the then and else blocks would all get printed.
2190     */
2191 this->base_ir = ir->condition;
2192
2193 if (intel->gen == 6) {
2194 emit_if_gen6(ir);
2195 } else {
2196 uint32_t predicate;
2197 emit_bool_to_cond_code(ir->condition, &predicate);
2198 emit(IF(predicate));
2199 }
2200
2201 visit_instructions(&ir->then_instructions);
2202
2203 if (!ir->else_instructions.is_empty()) {
2204 this->base_ir = ir->condition;
2205 emit(BRW_OPCODE_ELSE);
2206
2207 visit_instructions(&ir->else_instructions);
2208 }
2209
2210 this->base_ir = ir->condition;
2211 emit(BRW_OPCODE_ENDIF);
2212 }
2213
2214 void
2215 vec4_visitor::emit_ndc_computation()
2216 {
2217 /* Get the position */
2218 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2219
2220 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2221 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2222 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2223
2224 current_annotation = "NDC";
2225 dst_reg ndc_w = ndc;
2226 ndc_w.writemask = WRITEMASK_W;
2227 src_reg pos_w = pos;
2228 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2229 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2230
2231 dst_reg ndc_xyz = ndc;
2232 ndc_xyz.writemask = WRITEMASK_XYZ;
2233
2234 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2235 }
2236
2237 void
2238 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2239 {
2240 if (intel->gen < 6 &&
2241 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2242 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2243 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2244 dst_reg header1_w = header1;
2245 header1_w.writemask = WRITEMASK_W;
2246 GLuint i;
2247
2248 emit(MOV(header1, 0u));
2249
2250 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2251 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2252
2253 current_annotation = "Point size";
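         /* The point width appears to live in bits 8..18 of the first
          * header dword as a fixed-point value, hence the 1 << 11 scale and
          * the 0x7ff << 8 mask below.
          */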
2254 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2255 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2256 }
2257
2258 current_annotation = "Clipping flags";
2259 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2260 vec4_instruction *inst;
2261
2262 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2263 src_reg(this->userplane[i])));
2264 inst->conditional_mod = BRW_CONDITIONAL_L;
2265
2266 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2267 inst->predicate = BRW_PREDICATE_NORMAL;
2268 }
2269
2270 /* i965 clipping workaround:
2271       * 1) Test for a negative rhw (1/w).
2272       * 2) If it is negative,
2273 * set ndc = (0,0,0,0)
2274 * set ucp[6] = 1
2275 *
2276 * Later, clipping will detect ucp[6] and ensure the primitive is
2277 * clipped against all fixed planes.
2278 */
2279 if (brw->has_negative_rhw_bug) {
2280 #if 0
2281 /* FINISHME */
2282 brw_CMP(p,
2283 vec8(brw_null_reg()),
2284 BRW_CONDITIONAL_L,
2285 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2286 brw_imm_f(0));
2287
2288 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2289 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2290 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2291 #endif
2292 }
2293
2294 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2295 } else if (intel->gen < 6) {
2296 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2297 } else {
2298 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2299 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2300 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2301 src_reg(output_reg[VERT_RESULT_PSIZ])));
2302 }
2303 }
2304 }
2305
2306 void
2307 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2308 {
2309 if (intel->gen < 6) {
2310 /* Clip distance slots are set aside in gen5, but they are not used. It
2311 * is not clear whether we actually need to set aside space for them,
2312 * but the performance cost is negligible.
2313 */
2314 return;
2315 }
2316
2317 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2318 *
2319 * "If a linked set of shaders forming the vertex stage contains no
2320 * static write to gl_ClipVertex or gl_ClipDistance, but the
2321 * application has requested clipping against user clip planes through
2322 * the API, then the coordinate written to gl_Position is used for
2323 * comparison against the user clip planes."
2324 *
2325 * This function is only called if the shader didn't write to
2326 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2327 * if the user wrote to it; otherwise we use gl_Position.
2328 */
2329 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2330 if (!(c->prog_data.outputs_written
2331 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2332 clip_vertex = VERT_RESULT_HPOS;
2333 }
2334
2335 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2336 ++i) {
2337 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2338 src_reg(output_reg[clip_vertex]),
2339 src_reg(this->userplane[i + offset])));
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2345 {
2346 assert (vert_result < VERT_RESULT_MAX);
2347 reg.type = output_reg[vert_result].type;
2348 current_annotation = output_reg_annotation[vert_result];
2349 /* Copy the register, saturating if necessary */
2350 vec4_instruction *inst = emit(MOV(reg,
2351 src_reg(output_reg[vert_result])));
2352 if ((vert_result == VERT_RESULT_COL0 ||
2353 vert_result == VERT_RESULT_COL1 ||
2354 vert_result == VERT_RESULT_BFC0 ||
2355 vert_result == VERT_RESULT_BFC1) &&
2356 c->key.clamp_vertex_color) {
2357 inst->saturate = true;
2358 }
2359 }
2360
2361 void
2362 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2363 {
2364 struct brw_reg hw_reg = brw_message_reg(mrf);
2365 dst_reg reg = dst_reg(MRF, mrf);
2366 reg.type = BRW_REGISTER_TYPE_F;
2367
2368 switch (vert_result) {
2369 case VERT_RESULT_PSIZ:
2370 /* PSIZ is always in slot 0, and is coupled with other flags. */
2371 current_annotation = "indices, point width, clip flags";
2372 emit_psiz_and_flags(hw_reg);
2373 break;
2374 case BRW_VERT_RESULT_NDC:
2375 current_annotation = "NDC";
2376 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2377 break;
2378 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2379 case VERT_RESULT_HPOS:
2380 current_annotation = "gl_Position";
2381 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2382 break;
2383 case VERT_RESULT_CLIP_DIST0:
2384 case VERT_RESULT_CLIP_DIST1:
2385 if (this->c->key.uses_clip_distance) {
2386 emit_generic_urb_slot(reg, vert_result);
2387 } else {
2388 current_annotation = "user clip distances";
2389 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2390 }
2391 break;
2392 case VERT_RESULT_EDGE:
2393      /* This is present when doing unfilled polygons.  We're supposed to copy
2394       * the edge flag from the user-provided vertex array
2395       * (glEdgeFlagPointer); otherwise we copy the current value of that
2396       * attribute (which starts as 1.0f).  The clipper then uses this to
2397       * determine which edges should be drawn as wireframe.
2398       */
2399 current_annotation = "edge flag";
2400 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2401 glsl_type::float_type, WRITEMASK_XYZW))));
2402 break;
2403 case BRW_VERT_RESULT_PAD:
2404 /* No need to write to this slot */
2405 break;
2406 default:
2407 emit_generic_urb_slot(reg, vert_result);
2408 break;
2409 }
2410 }
2411
2412 static int
2413 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2414 {
2415 struct intel_context *intel = &brw->intel;
2416
2417 if (intel->gen >= 6) {
2418 /* URB data written (does not include the message header reg) must
2419 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2420 * section 5.4.3.2.2: URB_INTERLEAVED.
2421 *
2422 * URB entries are allocated on a multiple of 1024 bits, so an
2423 * extra 128 bits written here to make the end align to 256 is
2424 * no problem.
2425 */
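      /* For example, an mlen of 4 (one header register plus three data
       * registers) becomes 5, so that four data registers, i.e. two
       * 256-bit units, are written.
       */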
2426 if ((mlen % 2) != 1)
2427 mlen++;
2428 }
2429
2430 return mlen;
2431 }
2432
2433 /**
2434 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2435 * complete the VS thread.
2436 *
2437 * The VUE layout is documented in Volume 2a.
2438 */
2439 void
2440 vec4_visitor::emit_urb_writes()
2441 {
2442 /* MRF 0 is reserved for the debugger, so start with message header
2443 * in MRF 1.
2444 */
2445 int base_mrf = 1;
2446 int mrf = base_mrf;
2447 /* In the process of generating our URB write message contents, we
2448 * may need to unspill a register or load from an array. Those
2449 * reads would use MRFs 14-15.
2450 */
2451 int max_usable_mrf = 13;
2452
2453    /* The following assertion verifies that max_usable_mrf leaves an even
2454     * number of URB write data registers, which meets gen6's requirement
2455     * on write length alignment.
2456     */
2457 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2458
2459 /* First mrf is the g0-based message header containing URB handles and such,
2460 * which is implied in VS_OPCODE_URB_WRITE.
2461 */
2462 mrf++;
2463
2464 if (intel->gen < 6) {
2465 emit_ndc_computation();
2466 }
2467
2468 /* Set up the VUE data for the first URB write */
2469 int slot;
2470 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2471 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2472
2473 /* If this was max_usable_mrf, we can't fit anything more into this URB
2474 * WRITE.
2475 */
2476 if (mrf > max_usable_mrf) {
2477 slot++;
2478 break;
2479 }
2480 }
2481
2482 current_annotation = "URB write";
2483 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2484 inst->base_mrf = base_mrf;
2485 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2486 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2487
2488 /* Optional second URB write */
2489 if (!inst->eot) {
2490 mrf = base_mrf + 1;
2491
2492 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2493 assert(mrf < max_usable_mrf);
2494
2495 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2496 }
2497
2498 current_annotation = "URB write";
2499 inst = emit(VS_OPCODE_URB_WRITE);
2500 inst->base_mrf = base_mrf;
2501 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2502 inst->eot = true;
2503      /* URB destination offset.  In the previous write we used MRFs 1-13
2504       * minus the one header MRF, i.e. 12 payload registers.  The URB
2505       * offset is in URB row increments, and each of our MRFs is half of
2506       * one of those, since we're doing interleaved writes.
2507       */
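      /* With base_mrf == 1 and max_usable_mrf == 13 this works out to an
       * offset of 6 URB rows for the second write.
       */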
2508 inst->offset = (max_usable_mrf - base_mrf) / 2;
2509 }
2510 }
2511
2512 src_reg
2513 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2514 src_reg *reladdr, int reg_offset)
2515 {
2516 /* Because we store the values to scratch interleaved like our
2517 * vertex data, we need to scale the vec4 index by 2.
2518 */
2519 int message_header_scale = 2;
2520
2521 /* Pre-gen6, the message header uses byte offsets instead of vec4
2522 * (16-byte) offset units.
2523 */
2524 if (intel->gen < 6)
2525 message_header_scale *= 16;
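   /* For example, reg_offset == 3 becomes a constant offset of 6 on gen6+,
    * or a byte offset of 96 on earlier generations.
    */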
2526
2527 if (reladdr) {
2528 src_reg index = src_reg(this, glsl_type::int_type);
2529
2530 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2531 emit_before(inst, MUL(dst_reg(index),
2532 index, src_reg(message_header_scale)));
2533
2534 return index;
2535 } else {
2536 return src_reg(reg_offset * message_header_scale);
2537 }
2538 }
2539
2540 src_reg
2541 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2542 src_reg *reladdr, int reg_offset)
2543 {
2544 if (reladdr) {
2545 src_reg index = src_reg(this, glsl_type::int_type);
2546
2547 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2548
2549 /* Pre-gen6, the message header uses byte offsets instead of vec4
2550 * (16-byte) offset units.
2551 */
2552 if (intel->gen < 6) {
2553 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2554 }
2555
2556 return index;
2557 } else {
2558 int message_header_scale = intel->gen < 6 ? 16 : 1;
2559 return src_reg(reg_offset * message_header_scale);
2560 }
2561 }
2562
2563 /**
2564 * Emits an instruction before @inst to load the value named by @orig_src
2565 * from scratch space at @base_offset to @temp.
2566 *
2567 * @base_offset is measured in 32-byte units (the size of a register).
2568 */
2569 void
2570 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2571 dst_reg temp, src_reg orig_src,
2572 int base_offset)
2573 {
2574 int reg_offset = base_offset + orig_src.reg_offset;
2575 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2576
2577 emit_before(inst, SCRATCH_READ(temp, index));
2578 }
2579
2580 /**
2581 * Emits an instruction after @inst to store the value to be written
2582 * to @orig_dst to scratch space at @base_offset, from @temp.
2583 *
2584 * @base_offset is measured in 32-byte units (the size of a register).
2585 */
2586 void
2587 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2588 {
2589 int reg_offset = base_offset + inst->dst.reg_offset;
2590 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2591
2592 /* Create a temporary register to store *inst's result in.
2593 *
2594 * We have to be careful in MOVing from our temporary result register in
2595 * the scratch write. If we swizzle from channels of the temporary that
2596 * weren't initialized, it will confuse live interval analysis, which will
2597 * make spilling fail to make progress.
2598 */
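   /* For a destination writemask of .xz, for instance, the code below
    * builds a source swizzle of .xxzx, so only channels that were actually
    * written are ever read back.
    */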
2599 src_reg temp = src_reg(this, glsl_type::vec4_type);
2600 temp.type = inst->dst.type;
2601 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2602 int swizzles[4];
2603 for (int i = 0; i < 4; i++)
2604 if (inst->dst.writemask & (1 << i))
2605 swizzles[i] = i;
2606 else
2607 swizzles[i] = first_writemask_chan;
2608 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2609 swizzles[2], swizzles[3]);
2610
2611 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2612 inst->dst.writemask));
2613 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2614 write->predicate = inst->predicate;
2615 write->ir = inst->ir;
2616 write->annotation = inst->annotation;
2617 inst->insert_after(write);
2618
2619 inst->dst.file = temp.file;
2620 inst->dst.reg = temp.reg;
2621 inst->dst.reg_offset = temp.reg_offset;
2622 inst->dst.reladdr = NULL;
2623 }
2624
2625 /**
2626 * We can't generally support array access in GRF space, because a
2627 * single instruction's destination can only span 2 contiguous
2628 * registers. So, we send all GRF arrays that get variable index
2629 * access to scratch space.
2630 */
2631 void
2632 vec4_visitor::move_grf_array_access_to_scratch()
2633 {
2634 int scratch_loc[this->virtual_grf_count];
2635
2636 for (int i = 0; i < this->virtual_grf_count; i++) {
2637 scratch_loc[i] = -1;
2638 }
2639
2640 /* First, calculate the set of virtual GRFs that need to be punted
2641 * to scratch due to having any array access on them, and where in
2642 * scratch.
2643 */
2644 foreach_list(node, &this->instructions) {
2645 vec4_instruction *inst = (vec4_instruction *)node;
2646
2647 if (inst->dst.file == GRF && inst->dst.reladdr &&
2648 scratch_loc[inst->dst.reg] == -1) {
2649 scratch_loc[inst->dst.reg] = c->last_scratch;
2650 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2651 }
2652
2653 for (int i = 0 ; i < 3; i++) {
2654 src_reg *src = &inst->src[i];
2655
2656 if (src->file == GRF && src->reladdr &&
2657 scratch_loc[src->reg] == -1) {
2658 scratch_loc[src->reg] = c->last_scratch;
2659 c->last_scratch += this->virtual_grf_sizes[src->reg];
2660 }
2661 }
2662 }
2663
2664 /* Now, for anything that will be accessed through scratch, rewrite
2665 * it to load/store. Note that this is a _safe list walk, because
2666 * we may generate a new scratch_write instruction after the one
2667 * we're processing.
2668 */
2669 foreach_list_safe(node, &this->instructions) {
2670 vec4_instruction *inst = (vec4_instruction *)node;
2671
2672 /* Set up the annotation tracking for new generated instructions. */
2673 base_ir = inst->ir;
2674 current_annotation = inst->annotation;
2675
2676 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2677 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2678 }
2679
2680 for (int i = 0 ; i < 3; i++) {
2681 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2682 continue;
2683
2684 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2685
2686 emit_scratch_read(inst, temp, inst->src[i],
2687 scratch_loc[inst->src[i].reg]);
2688
2689 inst->src[i].file = temp.file;
2690 inst->src[i].reg = temp.reg;
2691 inst->src[i].reg_offset = temp.reg_offset;
2692 inst->src[i].reladdr = NULL;
2693 }
2694 }
2695 }
2696
2697 /**
2698 * Emits an instruction before @inst to load the value named by @orig_src
2699 * from the pull constant buffer (surface) at @base_offset to @temp.
2700 */
2701 void
2702 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2703 dst_reg temp, src_reg orig_src,
2704 int base_offset)
2705 {
2706 int reg_offset = base_offset + orig_src.reg_offset;
2707 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2708 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2709 vec4_instruction *load;
2710
2711 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2712 temp, index, offset);
2713 load->base_mrf = 14;
2714 load->mlen = 1;
2715 emit_before(inst, load);
2716 }
2717
2718 /**
2719 * Implements array access of uniforms by inserting a
2720 * PULL_CONSTANT_LOAD instruction.
2721 *
2722 * Unlike temporary GRF array access (where we don't support it due to
2723 * the difficulty of doing relative addressing on instruction
2724 * destinations), we could potentially do array access of uniforms
2725 * that were loaded in GRF space as push constants. In real-world
2726 * usage we've seen, though, the arrays being used are always larger
2727 * than we could load as push constants, so just always move all
2728 * uniform array access out to a pull constant buffer.
2729 */
2730 void
2731 vec4_visitor::move_uniform_array_access_to_pull_constants()
2732 {
2733 int pull_constant_loc[this->uniforms];
2734
2735 for (int i = 0; i < this->uniforms; i++) {
2736 pull_constant_loc[i] = -1;
2737 }
2738
2739 /* Walk through and find array access of uniforms. Put a copy of that
2740 * uniform in the pull constant buffer.
2741 *
2742 * Note that we don't move constant-indexed accesses to arrays. No
2743 * testing has been done of the performance impact of this choice.
2744 */
2745 foreach_list_safe(node, &this->instructions) {
2746 vec4_instruction *inst = (vec4_instruction *)node;
2747
2748 for (int i = 0 ; i < 3; i++) {
2749 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2750 continue;
2751
2752 int uniform = inst->src[i].reg;
2753
2754 /* If this array isn't already present in the pull constant buffer,
2755 * add it.
2756 */
2757 if (pull_constant_loc[uniform] == -1) {
2758 const float **values = &prog_data->param[uniform * 4];
2759
2760 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2761
2762 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2763 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2764 }
2765 }
2766
2767 /* Set up the annotation tracking for new generated instructions. */
2768 base_ir = inst->ir;
2769 current_annotation = inst->annotation;
2770
2771 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2772
2773 emit_pull_constant_load(inst, temp, inst->src[i],
2774 pull_constant_loc[uniform]);
2775
2776 inst->src[i].file = temp.file;
2777 inst->src[i].reg = temp.reg;
2778 inst->src[i].reg_offset = temp.reg_offset;
2779 inst->src[i].reladdr = NULL;
2780 }
2781 }
2782
2783 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2784 * no need to track them as larger-than-vec4 objects. This will be
2785 * relied on in cutting out unused uniform vectors from push
2786 * constants.
2787 */
2788 split_uniform_registers();
2789 }
2790
2791 void
2792 vec4_visitor::resolve_ud_negate(src_reg *reg)
2793 {
2794 if (reg->type != BRW_REGISTER_TYPE_UD ||
2795 !reg->negate)
2796 return;
2797
2798 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2799 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2800 *reg = temp;
2801 }
2802
2803 vec4_visitor::vec4_visitor(struct brw_context *brw,
2804 struct brw_vs_compile *c,
2805 struct gl_shader_program *prog,
2806 struct brw_shader *shader,
2807 void *mem_ctx)
2808 {
2809 this->c = c;
2810 this->brw = brw;
2811 this->intel = &brw->intel;
2812 this->ctx = &intel->ctx;
2813 this->prog = prog;
2814 this->shader = shader;
2815
2816 this->mem_ctx = mem_ctx;
2817 this->failed = false;
2818
2819 this->base_ir = NULL;
2820 this->current_annotation = NULL;
2821 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2822
2823 this->c = c;
2824 this->vp = &c->vp->program;
2825 this->prog_data = &c->prog_data;
2826
2827 this->variable_ht = hash_table_ctor(0,
2828 hash_table_pointer_hash,
2829 hash_table_pointer_compare);
2830
2831 this->virtual_grf_def = NULL;
2832 this->virtual_grf_use = NULL;
2833 this->virtual_grf_sizes = NULL;
2834 this->virtual_grf_count = 0;
2835 this->virtual_grf_reg_map = NULL;
2836 this->virtual_grf_reg_count = 0;
2837 this->virtual_grf_array_size = 0;
2838 this->live_intervals_valid = false;
2839
2840 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2841
2842 this->uniforms = 0;
2843 }
2844
2845 vec4_visitor::~vec4_visitor()
2846 {
2847 hash_table_dtor(this->variable_ht);
2848 }
2849
2850
2851 void
2852 vec4_visitor::fail(const char *format, ...)
2853 {
2854 va_list va;
2855 char *msg;
2856
2857 if (failed)
2858 return;
2859
2860 failed = true;
2861
2862 va_start(va, format);
2863 msg = ralloc_vasprintf(mem_ctx, format, va);
2864 va_end(va);
2865 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2866
2867 this->fail_msg = msg;
2868
2869 if (INTEL_DEBUG & DEBUG_VS) {
2870 fprintf(stderr, "%s", msg);
2871 }
2872 }
2873
2874 } /* namespace brw */