[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
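/* For illustration: ALU2(ADD) below expands to a small helper that only
 * constructs the instruction; callers still have to pass the result to
 * emit(), as in emit(ADD(dst, src0, src1)):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 */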
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU2(ADD)
117 ALU2(MUL)
118 ALU2(MACH)
119 ALU2(AND)
120 ALU2(OR)
121 ALU2(XOR)
122 ALU2(DP3)
123 ALU2(DP4)
124 ALU2(DPH)
125 ALU2(SHL)
126 ALU2(SHR)
127 ALU2(ASR)
128
129 /** Gen4 predicated IF. */
130 vec4_instruction *
131 vec4_visitor::IF(uint32_t predicate)
132 {
133 vec4_instruction *inst;
134
135 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
136 inst->predicate = predicate;
137
138 return inst;
139 }
140
141 /** Gen6+ IF with embedded comparison. */
142 vec4_instruction *
143 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
144 {
145 assert(intel->gen >= 6);
146
147 vec4_instruction *inst;
148
149 resolve_ud_negate(&src0);
150 resolve_ud_negate(&src1);
151
152 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
153 src0, src1);
154 inst->conditional_mod = condition;
155
156 return inst;
157 }
158
159 /**
160 * CMP: Sets the low bit of the destination channels with the result
161 * of the comparison, while the upper bits are undefined, and updates
162 * the flag register with the packed 16 bits of the result.
163 */
164 vec4_instruction *
165 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
166 {
167 vec4_instruction *inst;
168
169 /* original gen4 does type conversion to the destination type
170 * before comparison, producing garbage results for floating
171 * point comparisons.
172 */
173 if (intel->gen == 4) {
174 dst.type = src0.type;
175 if (dst.file == HW_REG)
176 dst.fixed_hw_reg.type = dst.type;
177 }
178
179 resolve_ud_negate(&src0);
180 resolve_ud_negate(&src1);
181
182 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
183 inst->conditional_mod = condition;
184
185 return inst;
186 }
187
188 vec4_instruction *
189 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
194 dst, index);
195 inst->base_mrf = 14;
196 inst->mlen = 2;
197
198 return inst;
199 }
200
201 vec4_instruction *
202 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
203 {
204 vec4_instruction *inst;
205
206 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
207 dst, src, index);
208 inst->base_mrf = 13;
209 inst->mlen = 3;
210
211 return inst;
212 }
213
214 void
215 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
216 {
217 static enum opcode dot_opcodes[] = {
218 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
219 };
220
221 emit(dot_opcodes[elements - 2], dst, src0, src1);
222 }
223
224 src_reg
225 vec4_visitor::fix_math_operand(src_reg src)
226 {
227 /* The gen6 math instruction ignores the source modifiers --
228 * swizzle, abs, negate, and at least some parts of the register
229 * region description.
230 *
231 * Rather than trying to enumerate all these cases, *always* expand the
232 * operand to a temp GRF for gen6.
233 *
234 * For gen7, keep the operand as-is, except if immediate, which gen7 still
235 * can't use.
236 */
237
238 if (intel->gen == 7 && src.file != IMM)
239 return src;
240
241 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
242 expanded.type = src.type;
243 emit(MOV(expanded, src));
244 return src_reg(expanded);
245 }
246
247 void
248 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
249 {
250 src = fix_math_operand(src);
251
252 if (dst.writemask != WRITEMASK_XYZW) {
253 /* The gen6 math instruction must be align1, so we can't do
254 * writemasks.
255 */
256 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
257
258 emit(opcode, temp_dst, src);
259
260 emit(MOV(dst, src_reg(temp_dst)));
261 } else {
262 emit(opcode, dst, src);
263 }
264 }
265
266 void
267 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
268 {
269 vec4_instruction *inst = emit(opcode, dst, src);
270 inst->base_mrf = 1;
271 inst->mlen = 1;
272 }
273
274 void
275 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
276 {
277 switch (opcode) {
278 case SHADER_OPCODE_RCP:
279 case SHADER_OPCODE_RSQ:
280 case SHADER_OPCODE_SQRT:
281 case SHADER_OPCODE_EXP2:
282 case SHADER_OPCODE_LOG2:
283 case SHADER_OPCODE_SIN:
284 case SHADER_OPCODE_COS:
285 break;
286 default:
287 assert(!"not reached: bad math opcode");
288 return;
289 }
290
291 if (intel->gen >= 6) {
292 return emit_math1_gen6(opcode, dst, src);
293 } else {
294 return emit_math1_gen4(opcode, dst, src);
295 }
296 }
297
298 void
299 vec4_visitor::emit_math2_gen6(enum opcode opcode,
300 dst_reg dst, src_reg src0, src_reg src1)
301 {
302 src0 = fix_math_operand(src0);
303 src1 = fix_math_operand(src1);
304
305 if (dst.writemask != WRITEMASK_XYZW) {
306 /* The gen6 math instruction must be align1, so we can't do
307 * writemasks.
308 */
309 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
310 temp_dst.type = dst.type;
311
312 emit(opcode, temp_dst, src0, src1);
313
314 emit(MOV(dst, src_reg(temp_dst)));
315 } else {
316 emit(opcode, dst, src0, src1);
317 }
318 }
319
320 void
321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
322 dst_reg dst, src_reg src0, src_reg src1)
323 {
324 vec4_instruction *inst = emit(opcode, dst, src0, src1);
325 inst->base_mrf = 1;
326 inst->mlen = 2;
327 }
328
329 void
330 vec4_visitor::emit_math(enum opcode opcode,
331 dst_reg dst, src_reg src0, src_reg src1)
332 {
333 switch (opcode) {
334 case SHADER_OPCODE_POW:
335 case SHADER_OPCODE_INT_QUOTIENT:
336 case SHADER_OPCODE_INT_REMAINDER:
337 break;
338 default:
339 assert(!"not reached: unsupported binary math opcode");
340 return;
341 }
342
343 if (intel->gen >= 6) {
344 return emit_math2_gen6(opcode, dst, src0, src1);
345 } else {
346 return emit_math2_gen4(opcode, dst, src0, src1);
347 }
348 }
349
350 void
351 vec4_visitor::visit_instructions(const exec_list *list)
352 {
353 foreach_list(node, list) {
354 ir_instruction *ir = (ir_instruction *)node;
355
356 base_ir = ir;
357 ir->accept(this);
358 }
359 }
360
361
362 static int
363 type_size(const struct glsl_type *type)
364 {
365 unsigned int i;
366 int size;
367
368 switch (type->base_type) {
369 case GLSL_TYPE_UINT:
370 case GLSL_TYPE_INT:
371 case GLSL_TYPE_FLOAT:
372 case GLSL_TYPE_BOOL:
373 if (type->is_matrix()) {
374 return type->matrix_columns;
375 } else {
376 /* Regardless of the size of the vector, it gets a vec4. This is bad
377 * packing for things like floats, but otherwise arrays become a
378 * mess. Hopefully a later pass over the code can pack scalars
379 * down if appropriate.
380 */
381 return 1;
382 }
383 case GLSL_TYPE_ARRAY:
384 assert(type->length > 0);
385 return type_size(type->fields.array) * type->length;
386 case GLSL_TYPE_STRUCT:
387 size = 0;
388 for (i = 0; i < type->length; i++) {
389 size += type_size(type->fields.structure[i].type);
390 }
391 return size;
392 case GLSL_TYPE_SAMPLER:
393 /* Samplers take up one slot in UNIFORMS[], but they're baked in
394 * at link time.
395 */
396 return 1;
397 default:
398 assert(0);
399 return 0;
400 }
401 }
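/* Roughly, type_size() above counts vec4-sized slots: a float or vec3 is 1,
 * a mat4 is 4 (one per column), a vec4[8] array is 8, and a
 * struct { vec3 a; float b; } is 2.
 */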
402
403 int
404 vec4_visitor::virtual_grf_alloc(int size)
405 {
406 if (virtual_grf_array_size <= virtual_grf_count) {
407 if (virtual_grf_array_size == 0)
408 virtual_grf_array_size = 16;
409 else
410 virtual_grf_array_size *= 2;
411 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
412 virtual_grf_array_size);
413 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
414 virtual_grf_array_size);
415 }
416 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
417 virtual_grf_reg_count += size;
418 virtual_grf_sizes[virtual_grf_count] = size;
419 return virtual_grf_count++;
420 }
421
422 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
423 {
424 init();
425
426 this->file = GRF;
427 this->reg = v->virtual_grf_alloc(type_size(type));
428
429 if (type->is_array() || type->is_record()) {
430 this->swizzle = BRW_SWIZZLE_NOOP;
431 } else {
432 this->swizzle = swizzle_for_size(type->vector_elements);
433 }
434
435 this->type = brw_type_for_base_type(type);
436 }
437
438 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
439 {
440 init();
441
442 this->file = GRF;
443 this->reg = v->virtual_grf_alloc(type_size(type));
444
445 if (type->is_array() || type->is_record()) {
446 this->writemask = WRITEMASK_XYZW;
447 } else {
448 this->writemask = (1 << type->vector_elements) - 1;
449 }
450
451 this->type = brw_type_for_base_type(type);
452 }
453
454 /* Our support for uniforms is piggy-backed on the struct
455 * gl_fragment_program, because that's where the values actually
456 * get stored, rather than in some global gl_shader_program uniform
457 * store.
458 */
459 void
460 vec4_visitor::setup_uniform_values(ir_variable *ir)
461 {
462 int namelen = strlen(ir->name);
463
464 /* The data for our (non-builtin) uniforms is stored in a series of
465 * gl_uniform_driver_storage structs for each subcomponent that
466 * glGetUniformLocation() could name. We know it's been set up in the same
467 * order we'd walk the type, so walk the list of storage and find anything
468 * with our name, or the prefix of a component that starts with our name.
469 */
470 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
471 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
472
473 if (strncmp(ir->name, storage->name, namelen) != 0 ||
474 (storage->name[namelen] != 0 &&
475 storage->name[namelen] != '.' &&
476 storage->name[namelen] != '[')) {
477 continue;
478 }
479
480 gl_constant_value *components = storage->storage;
481 unsigned vector_count = (MAX2(storage->array_elements, 1) *
482 storage->type->matrix_columns);
483
484 for (unsigned s = 0; s < vector_count; s++) {
485 uniform_vector_size[uniforms] = storage->type->vector_elements;
486
487 int i;
488 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
489 c->prog_data.param[uniforms * 4 + i] = &components->f;
490 components++;
491 }
492 for (; i < 4; i++) {
493 static float zero = 0;
494 c->prog_data.param[uniforms * 4 + i] = &zero;
495 }
496
497 uniforms++;
498 }
499 }
500 }
501
502 void
503 vec4_visitor::setup_uniform_clipplane_values()
504 {
505 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
506
507 if (intel->gen < 6) {
508 /* Pre-Gen6, we compact clip planes. For example, if the user
509 * enables just clip planes 0, 1, and 3, we will enable clip planes
510 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
511 * plane 2. This simplifies the implementation of the Gen6 clip
512 * thread.
513 */
514 int compacted_clipplane_index = 0;
515 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
516 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
517 continue;
518
519 this->uniform_vector_size[this->uniforms] = 4;
520 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
521 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
522 for (int j = 0; j < 4; ++j) {
523 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
524 }
525 ++compacted_clipplane_index;
526 ++this->uniforms;
527 }
528 } else {
529 /* In Gen6 and later, we don't compact clip planes, because this
530 * simplifies the implementation of gl_ClipDistance.
531 */
532 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
533 this->uniform_vector_size[this->uniforms] = 4;
534 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
535 this->userplane[i].type = BRW_REGISTER_TYPE_F;
536 for (int j = 0; j < 4; ++j) {
537 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
538 }
539 ++this->uniforms;
540 }
541 }
542 }
543
544 /* Our support for builtin uniforms is even scarier than non-builtin.
545 * It sits on top of the PROG_STATE_VAR parameters that are
546 * automatically updated from GL context state.
547 */
548 void
549 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
550 {
551 const ir_state_slot *const slots = ir->state_slots;
552 assert(ir->state_slots != NULL);
553
554 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
555 /* This state reference has already been setup by ir_to_mesa,
556 * but we'll get the same index back here. We can reference
557 * ParameterValues directly, since unlike brw_fs.cpp, we never
558 * add new state references during compile.
559 */
560 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
561 (gl_state_index *)slots[i].tokens);
562 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
563
564 this->uniform_vector_size[this->uniforms] = 0;
565 /* Add each of the unique swizzled channels of the element.
566 * This will end up matching the size of the glsl_type of this field.
567 */
568 int last_swiz = -1;
569 for (unsigned int j = 0; j < 4; j++) {
570 int swiz = GET_SWZ(slots[i].swizzle, j);
571 last_swiz = swiz;
572
573 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
574 if (swiz <= last_swiz)
575 this->uniform_vector_size[this->uniforms]++;
576 }
577 this->uniforms++;
578 }
579 }
580
581 dst_reg *
582 vec4_visitor::variable_storage(ir_variable *var)
583 {
584 return (dst_reg *)hash_table_find(this->variable_ht, var);
585 }
586
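/* Evaluate a boolean condition and leave its result in the hardware flag
 * register via an instruction with a conditional mod, so that a following
 * predicated instruction (or IF) can use it. *predicate is set to the
 * predication mode the caller should apply (normal, or an ALIGN16
 * any4h/all4h mode for vector comparisons).
 */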
587 void
588 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
589 {
590 ir_expression *expr = ir->as_expression();
591
592 *predicate = BRW_PREDICATE_NORMAL;
593
594 if (expr) {
595 src_reg op[2];
596 vec4_instruction *inst;
597
598 assert(expr->get_num_operands() <= 2);
599 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
600 expr->operands[i]->accept(this);
601 op[i] = this->result;
602
603 resolve_ud_negate(&op[i]);
604 }
605
606 switch (expr->operation) {
607 case ir_unop_logic_not:
608 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
609 inst->conditional_mod = BRW_CONDITIONAL_Z;
610 break;
611
612 case ir_binop_logic_xor:
613 inst = emit(XOR(dst_null_d(), op[0], op[1]));
614 inst->conditional_mod = BRW_CONDITIONAL_NZ;
615 break;
616
617 case ir_binop_logic_or:
618 inst = emit(OR(dst_null_d(), op[0], op[1]));
619 inst->conditional_mod = BRW_CONDITIONAL_NZ;
620 break;
621
622 case ir_binop_logic_and:
623 inst = emit(AND(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_unop_f2b:
628 if (intel->gen >= 6) {
629 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
630 } else {
631 inst = emit(MOV(dst_null_f(), op[0]));
632 inst->conditional_mod = BRW_CONDITIONAL_NZ;
633 }
634 break;
635
636 case ir_unop_i2b:
637 if (intel->gen >= 6) {
638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
639 } else {
640 inst = emit(MOV(dst_null_d(), op[0]));
641 inst->conditional_mod = BRW_CONDITIONAL_NZ;
642 }
643 break;
644
645 case ir_binop_all_equal:
646 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
647 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
648 break;
649
650 case ir_binop_any_nequal:
651 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
652 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
653 break;
654
655 case ir_unop_any:
656 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
657 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
658 break;
659
660 case ir_binop_greater:
661 case ir_binop_gequal:
662 case ir_binop_less:
663 case ir_binop_lequal:
664 case ir_binop_equal:
665 case ir_binop_nequal:
666 emit(CMP(dst_null_d(), op[0], op[1],
667 brw_conditional_for_comparison(expr->operation)));
668 break;
669
670 default:
671 assert(!"not reached");
672 break;
673 }
674 return;
675 }
676
677 ir->accept(this);
678
679 resolve_ud_negate(&this->result);
680
681 if (intel->gen >= 6) {
682 vec4_instruction *inst = emit(AND(dst_null_d(),
683 this->result, src_reg(1)));
684 inst->conditional_mod = BRW_CONDITIONAL_NZ;
685 } else {
686 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
687 inst->conditional_mod = BRW_CONDITIONAL_NZ;
688 }
689 }
690
691 /**
692 * Emit a gen6 IF statement with the comparison folded into the IF
693 * instruction.
694 */
695 void
696 vec4_visitor::emit_if_gen6(ir_if *ir)
697 {
698 ir_expression *expr = ir->condition->as_expression();
699
700 if (expr) {
701 src_reg op[2];
702 dst_reg temp;
703
704 assert(expr->get_num_operands() <= 2);
705 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
706 expr->operands[i]->accept(this);
707 op[i] = this->result;
708 }
709
710 switch (expr->operation) {
711 case ir_unop_logic_not:
712 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
713 return;
714
715 case ir_binop_logic_xor:
716 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
717 return;
718
719 case ir_binop_logic_or:
720 temp = dst_reg(this, glsl_type::bool_type);
721 emit(OR(temp, op[0], op[1]));
722 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
723 return;
724
725 case ir_binop_logic_and:
726 temp = dst_reg(this, glsl_type::bool_type);
727 emit(AND(temp, op[0], op[1]));
728 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
729 return;
730
731 case ir_unop_f2b:
732 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_unop_i2b:
736 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
737 return;
738
739 case ir_binop_greater:
740 case ir_binop_gequal:
741 case ir_binop_less:
742 case ir_binop_lequal:
743 case ir_binop_equal:
744 case ir_binop_nequal:
745 emit(IF(op[0], op[1],
746 brw_conditional_for_comparison(expr->operation)));
747 return;
748
749 case ir_binop_all_equal:
750 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
751 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
752 return;
753
754 case ir_binop_any_nequal:
755 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
756 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
757 return;
758
759 case ir_unop_any:
760 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
761 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
762 return;
763
764 default:
765 assert(!"not reached");
766 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
767 return;
768 }
769 return;
770 }
771
772 ir->condition->accept(this);
773
774 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
775 }
776
777 static dst_reg
778 with_writemask(dst_reg const & r, int mask)
779 {
780 dst_reg result = r;
781 result.writemask = mask;
782 return result;
783 }
784
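/* Emit per-attribute workaround code for vertex formats the hardware can't
 * fetch directly: GL_FIXED rescaling, plus sign recovery, normalization,
 * scaling and BGRA swizzling for packed 2_10_10_10-style formats, as
 * selected by the gl_attrib_wa_flags in the program key.
 */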
785 void
786 vec4_visitor::emit_attribute_fixups()
787 {
788 dst_reg sign_recovery_shift;
789 dst_reg normalize_factor;
790 dst_reg es3_normalize_factor;
791
792 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
793 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
794 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
795 dst_reg reg(ATTR, i);
796 dst_reg reg_d = reg;
797 reg_d.type = BRW_REGISTER_TYPE_D;
798 dst_reg reg_ud = reg;
799 reg_ud.type = BRW_REGISTER_TYPE_UD;
800
801 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
802 * come in as floating point conversions of the integer values.
803 */
804 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
805 dst_reg dst = reg;
806 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
807 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
808 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
809 }
810
811 /* Do sign recovery for 2101010 formats if required. */
812 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
813 if (sign_recovery_shift.file == BAD_FILE) {
814 /* shift constant: <22,22,22,30> */
815 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
816 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
817 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
818 }
819
820 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
821 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
822 }
823
824 /* Apply BGRA swizzle if required. */
825 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
826 src_reg temp = src_reg(reg);
827 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
828 emit(MOV(reg, temp));
829 }
830
831 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
832 /* ES 3.0 has different rules for converting signed normalized
833 * fixed-point numbers than desktop GL.
834 */
835 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
836 /* According to equation 2.2 of the ES 3.0 specification,
837 * signed normalization conversion is done by:
838 *
839 * f = c / (2^(b-1)-1)
840 */
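/* For the packed 2_10_10_10 formats handled here, b is 10 for the XYZ
 * components and 2 for W, so 2^(b-1)-1 works out to 511 and 1
 * respectively -- matching the constants loaded just below.
 */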
841 if (es3_normalize_factor.file == BAD_FILE) {
842 /* mul constant: 1 / (2^(b-1) - 1) */
843 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
844 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
845 src_reg(1.0f / ((1<<9) - 1))));
846 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
847 src_reg(1.0f / ((1<<1) - 1))));
848 }
849
850 dst_reg dst = reg;
851 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
852 emit(MOV(dst, src_reg(reg_d)));
853 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
854 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
855 } else {
856 /* The following equations are from the OpenGL 3.2 specification:
857 *
858 * 2.1 unsigned normalization
859 * f = c/(2^n-1)
860 *
861 * 2.2 signed normalization
862 * f = (2c+1)/(2^n-1)
863 *
864 * Both of these share a common divisor, which is represented by
865 * "normalize_factor" in the code below.
866 */
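/* Again for 2_10_10_10 data: n is 10 for XYZ and 2 for W, so the shared
 * divisor 2^n-1 is 1023 and 3 respectively, which is what the factors
 * below are set to.
 */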
867 if (normalize_factor.file == BAD_FILE) {
868 /* 1 / (2^b - 1) for b=<10,10,10,2> */
869 normalize_factor = dst_reg(this, glsl_type::vec4_type);
870 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
871 src_reg(1.0f / ((1<<10) - 1))));
872 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
873 src_reg(1.0f / ((1<<2) - 1))));
874 }
875
876 dst_reg dst = reg;
877 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
878 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
879
880 /* For signed normalization, we want the numerator to be 2c+1. */
881 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
882 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
883 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
884 }
885
886 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
887 }
888 }
889
890 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
891 dst_reg dst = reg;
892 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
893 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
894 }
895 }
896 }
897 }
898
899 void
900 vec4_visitor::visit(ir_variable *ir)
901 {
902 dst_reg *reg = NULL;
903
904 if (variable_storage(ir))
905 return;
906
907 switch (ir->mode) {
908 case ir_var_shader_in:
909 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
910 break;
911
912 case ir_var_shader_out:
913 reg = new(mem_ctx) dst_reg(this, ir->type);
914
915 for (int i = 0; i < type_size(ir->type); i++) {
916 output_reg[ir->location + i] = *reg;
917 output_reg[ir->location + i].reg_offset = i;
918 output_reg[ir->location + i].type =
919 brw_type_for_base_type(ir->type->get_scalar_type());
920 output_reg_annotation[ir->location + i] = ir->name;
921 }
922 break;
923
924 case ir_var_auto:
925 case ir_var_temporary:
926 reg = new(mem_ctx) dst_reg(this, ir->type);
927 break;
928
929 case ir_var_uniform:
930 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
931
932 /* Thanks to the lower_ubo_reference pass, we will see only
933 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
934 * variables, so no need for them to be in variable_ht.
935 */
936 if (ir->uniform_block != -1)
937 return;
938
939 /* Track how big the whole uniform variable is, in case we need to put a
940 * copy of its data into pull constants for array access.
941 */
942 this->uniform_size[this->uniforms] = type_size(ir->type);
943
944 if (!strncmp(ir->name, "gl_", 3)) {
945 setup_builtin_uniform_values(ir);
946 } else {
947 setup_uniform_values(ir);
948 }
949 break;
950
951 case ir_var_system_value:
952 /* VertexID is stored by the VF as the last vertex element, but
953 * we don't represent it with a flag in inputs_read, so we call
954 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
955 */
956 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
957 prog_data->uses_vertexid = true;
958
959 switch (ir->location) {
960 case SYSTEM_VALUE_VERTEX_ID:
961 reg->writemask = WRITEMASK_X;
962 break;
963 case SYSTEM_VALUE_INSTANCE_ID:
964 reg->writemask = WRITEMASK_Y;
965 break;
966 default:
967 assert(!"not reached");
968 break;
969 }
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
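/* If this expression is just a saturate (a clamp to [0, 1]) of some value,
 * emit the inner value with the MOV's saturate modifier set instead of
 * generating explicit clamping code. Returns true if that was possible.
 */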
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 void
1086 vec4_visitor::emit_bool_comparison(unsigned int op,
1087 dst_reg dst, src_reg src0, src_reg src1)
1088 {
1089 /* original gen4 does destination conversion before comparison. */
1090 if (intel->gen < 5)
1091 dst.type = src0.type;
1092
1093 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1094
1095 dst.type = BRW_REGISTER_TYPE_D;
1096 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1097 }
1098
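/* Emit a min/max-style select: on Gen6+ a single SEL with the requested
 * conditional mod is enough, while Gen4/5 need an explicit CMP followed by
 * a predicated SEL.
 */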
1099 void
1100 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1101 src_reg src0, src_reg src1)
1102 {
1103 vec4_instruction *inst;
1104
1105 if (intel->gen >= 6) {
1106 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1107 inst->conditional_mod = conditionalmod;
1108 } else {
1109 emit(CMP(dst, src0, src1, conditionalmod));
1110
1111 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1112 inst->predicate = BRW_PREDICATE_NORMAL;
1113 }
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_expression *ir)
1118 {
1119 unsigned int operand;
1120 src_reg op[Elements(ir->operands)];
1121 src_reg result_src;
1122 dst_reg result_dst;
1123 vec4_instruction *inst;
1124
1125 if (try_emit_sat(ir))
1126 return;
1127
1128 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1129 this->result.file = BAD_FILE;
1130 ir->operands[operand]->accept(this);
1131 if (this->result.file == BAD_FILE) {
1132 printf("Failed to get tree for expression operand:\n");
1133 ir->operands[operand]->print();
1134 exit(1);
1135 }
1136 op[operand] = this->result;
1137
1138 /* Matrix expression operands should have been broken down to vector
1139 * operations already.
1140 */
1141 assert(!ir->operands[operand]->type->is_matrix());
1142 }
1143
1144 int vector_elements = ir->operands[0]->type->vector_elements;
1145 if (ir->operands[1]) {
1146 vector_elements = MAX2(vector_elements,
1147 ir->operands[1]->type->vector_elements);
1148 }
1149
1150 this->result.file = BAD_FILE;
1151
1152 /* Storage for our result. Ideally for an assignment we'd be using
1153 * the actual storage for the result here, instead.
1154 */
1155 result_src = src_reg(this, ir->type);
1156 /* convenience for the emit functions below. */
1157 result_dst = dst_reg(result_src);
1158 /* If nothing special happens, this is the result. */
1159 this->result = result_src;
1160 /* Limit writes to the channels that will be used by result_src later.
1161 * This does limit this temp's use as a temporary for multi-instruction
1162 * sequences.
1163 */
1164 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1165
1166 switch (ir->operation) {
1167 case ir_unop_logic_not:
1168 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1169 * ones complement of the whole register, not just bit 0.
1170 */
1171 emit(XOR(result_dst, op[0], src_reg(1)));
1172 break;
1173 case ir_unop_neg:
1174 op[0].negate = !op[0].negate;
1175 this->result = op[0];
1176 break;
1177 case ir_unop_abs:
1178 op[0].abs = true;
1179 op[0].negate = false;
1180 this->result = op[0];
1181 break;
1182
1183 case ir_unop_sign:
1184 emit(MOV(result_dst, src_reg(0.0f)));
1185
1186 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1187 inst = emit(MOV(result_dst, src_reg(1.0f)));
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189
1190 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1191 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1192 inst->predicate = BRW_PREDICATE_NORMAL;
1193
1194 break;
1195
1196 case ir_unop_rcp:
1197 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1198 break;
1199
1200 case ir_unop_exp2:
1201 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1202 break;
1203 case ir_unop_log2:
1204 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1205 break;
1206 case ir_unop_exp:
1207 case ir_unop_log:
1208 assert(!"not reached: should be handled by ir_explog_to_explog2");
1209 break;
1210 case ir_unop_sin:
1211 case ir_unop_sin_reduced:
1212 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1213 break;
1214 case ir_unop_cos:
1215 case ir_unop_cos_reduced:
1216 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1217 break;
1218
1219 case ir_unop_dFdx:
1220 case ir_unop_dFdy:
1221 assert(!"derivatives not valid in vertex shader");
1222 break;
1223
1224 case ir_unop_noise:
1225 assert(!"not reached: should be handled by lower_noise");
1226 break;
1227
1228 case ir_binop_add:
1229 emit(ADD(result_dst, op[0], op[1]));
1230 break;
1231 case ir_binop_sub:
1232 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1233 break;
1234
1235 case ir_binop_mul:
1236 if (ir->type->is_integer()) {
1237 /* For integer multiplication, the MUL uses the low 16 bits
1238 * of one of the operands (src0 on gen6, src1 on gen7). The
1239 * MACH accumulates in the contribution of the upper 16 bits
1240 * of that operand.
1241 *
1242 * FINISHME: Emit just the MUL if we know an operand is small
1243 * enough.
1244 */
1245 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1246
1247 emit(MUL(acc, op[0], op[1]));
1248 emit(MACH(dst_null_d(), op[0], op[1]));
1249 emit(MOV(result_dst, src_reg(acc)));
1250 } else {
1251 emit(MUL(result_dst, op[0], op[1]));
1252 }
1253 break;
1254 case ir_binop_div:
1255 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1256 assert(ir->type->is_integer());
1257 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1258 break;
1259 case ir_binop_mod:
1260 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1261 assert(ir->type->is_integer());
1262 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1263 break;
1264
1265 case ir_binop_less:
1266 case ir_binop_greater:
1267 case ir_binop_lequal:
1268 case ir_binop_gequal:
1269 case ir_binop_equal:
1270 case ir_binop_nequal: {
1271 emit(CMP(result_dst, op[0], op[1],
1272 brw_conditional_for_comparison(ir->operation)));
1273 emit(AND(result_dst, result_src, src_reg(0x1)));
1274 break;
1275 }
1276
1277 case ir_binop_all_equal:
1278 /* "==" operator producing a scalar boolean. */
1279 if (ir->operands[0]->type->is_vector() ||
1280 ir->operands[1]->type->is_vector()) {
1281 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1282 emit(MOV(result_dst, src_reg(0)));
1283 inst = emit(MOV(result_dst, src_reg(1)));
1284 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1285 } else {
1286 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1287 emit(AND(result_dst, result_src, src_reg(0x1)));
1288 }
1289 break;
1290 case ir_binop_any_nequal:
1291 /* "!=" operator producing a scalar boolean. */
1292 if (ir->operands[0]->type->is_vector() ||
1293 ir->operands[1]->type->is_vector()) {
1294 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1295
1296 emit(MOV(result_dst, src_reg(0)));
1297 inst = emit(MOV(result_dst, src_reg(1)));
1298 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1299 } else {
1300 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1301 emit(AND(result_dst, result_src, src_reg(0x1)));
1302 }
1303 break;
1304
1305 case ir_unop_any:
1306 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1307 emit(MOV(result_dst, src_reg(0)));
1308
1309 inst = emit(MOV(result_dst, src_reg(1)));
1310 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1311 break;
1312
1313 case ir_binop_logic_xor:
1314 emit(XOR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_logic_or:
1318 emit(OR(result_dst, op[0], op[1]));
1319 break;
1320
1321 case ir_binop_logic_and:
1322 emit(AND(result_dst, op[0], op[1]));
1323 break;
1324
1325 case ir_binop_dot:
1326 assert(ir->operands[0]->type->is_vector());
1327 assert(ir->operands[0]->type == ir->operands[1]->type);
1328 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1329 break;
1330
1331 case ir_unop_sqrt:
1332 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1333 break;
1334 case ir_unop_rsq:
1335 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_bitcast_i2f:
1339 case ir_unop_bitcast_u2f:
1340 this->result = op[0];
1341 this->result.type = BRW_REGISTER_TYPE_F;
1342 break;
1343
1344 case ir_unop_bitcast_f2i:
1345 this->result = op[0];
1346 this->result.type = BRW_REGISTER_TYPE_D;
1347 break;
1348
1349 case ir_unop_bitcast_f2u:
1350 this->result = op[0];
1351 this->result.type = BRW_REGISTER_TYPE_UD;
1352 break;
1353
1354 case ir_unop_i2f:
1355 case ir_unop_i2u:
1356 case ir_unop_u2i:
1357 case ir_unop_u2f:
1358 case ir_unop_b2f:
1359 case ir_unop_b2i:
1360 case ir_unop_f2i:
1361 case ir_unop_f2u:
1362 emit(MOV(result_dst, op[0]));
1363 break;
1364 case ir_unop_f2b:
1365 case ir_unop_i2b: {
1366 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1367 emit(AND(result_dst, result_src, src_reg(1)));
1368 break;
1369 }
1370
1371 case ir_unop_trunc:
1372 emit(RNDZ(result_dst, op[0]));
1373 break;
1374 case ir_unop_ceil:
1375 op[0].negate = !op[0].negate;
1376 inst = emit(RNDD(result_dst, op[0]));
1377 this->result.negate = true;
1378 break;
1379 case ir_unop_floor:
1380 inst = emit(RNDD(result_dst, op[0]));
1381 break;
1382 case ir_unop_fract:
1383 inst = emit(FRC(result_dst, op[0]));
1384 break;
1385 case ir_unop_round_even:
1386 emit(RNDE(result_dst, op[0]));
1387 break;
1388
1389 case ir_binop_min:
1390 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1391 break;
1392 case ir_binop_max:
1393 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1394 break;
1395
1396 case ir_binop_pow:
1397 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1398 break;
1399
1400 case ir_unop_bit_not:
1401 inst = emit(NOT(result_dst, op[0]));
1402 break;
1403 case ir_binop_bit_and:
1404 inst = emit(AND(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_bit_xor:
1407 inst = emit(XOR(result_dst, op[0], op[1]));
1408 break;
1409 case ir_binop_bit_or:
1410 inst = emit(OR(result_dst, op[0], op[1]));
1411 break;
1412
1413 case ir_binop_lshift:
1414 inst = emit(SHL(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_rshift:
1418 if (ir->type->base_type == GLSL_TYPE_INT)
1419 inst = emit(ASR(result_dst, op[0], op[1]));
1420 else
1421 inst = emit(SHR(result_dst, op[0], op[1]));
1422 break;
1423
1424 case ir_binop_ubo_load: {
1425 ir_constant *uniform_block = ir->operands[0]->as_constant();
1426 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1427 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1428 src_reg offset = op[1];
1429
1430 /* Now, load the vector from that offset. */
1431 assert(ir->type->is_vector() || ir->type->is_scalar());
1432
1433 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1434 packed_consts.type = result.type;
1435 src_reg surf_index =
1436 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1437 if (const_offset_ir) {
1438 offset = src_reg(const_offset / 16);
1439 } else {
1440 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1441 }
1442
1443 vec4_instruction *pull =
1444 emit(new(mem_ctx) vec4_instruction(this,
1445 VS_OPCODE_PULL_CONSTANT_LOAD,
1446 dst_reg(packed_consts),
1447 surf_index,
1448 offset));
1449 pull->base_mrf = 14;
1450 pull->mlen = 1;
1451
1452 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1453 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1454 const_offset % 16 / 4,
1455 const_offset % 16 / 4,
1456 const_offset % 16 / 4);
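/* Illustrative example: a float loaded from a constant byte offset of 20
 * fetches the vec4 at offset 20/16 = 1 and broadcasts channel
 * (20 % 16) / 4 = 1, i.e. the .y component of the fetched register.
 */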
1457
1458 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1459 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1460 emit(CMP(result_dst, packed_consts, src_reg(0u),
1461 BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result, src_reg(0x1)));
1463 } else {
1464 emit(MOV(result_dst, packed_consts));
1465 }
1466 break;
1467 }
1468
1469 case ir_quadop_vector:
1470 assert(!"not reached: should be handled by lower_quadop_vector");
1471 break;
1472 }
1473 }
1474
1475
1476 void
1477 vec4_visitor::visit(ir_swizzle *ir)
1478 {
1479 src_reg src;
1480 int i = 0;
1481 int swizzle[4];
1482
1483 /* Note that this is only swizzles in expressions, not those on the left
1484 * hand side of an assignment, which do write masking. See ir_assignment
1485 * for that.
1486 */
1487
1488 ir->val->accept(this);
1489 src = this->result;
1490 assert(src.file != BAD_FILE);
1491
1492 for (i = 0; i < ir->type->vector_elements; i++) {
1493 switch (i) {
1494 case 0:
1495 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1496 break;
1497 case 1:
1498 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1499 break;
1500 case 2:
1501 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1502 break;
1503 case 3:
1504 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1505 break;
1506 }
1507 }
1508 for (; i < 4; i++) {
1509 /* Replicate the last channel out. */
1510 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1511 }
1512
1513 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1514
1515 this->result = src;
1516 }
1517
1518 void
1519 vec4_visitor::visit(ir_dereference_variable *ir)
1520 {
1521 const struct glsl_type *type = ir->type;
1522 dst_reg *reg = variable_storage(ir->var);
1523
1524 if (!reg) {
1525 fail("Failed to find variable storage for %s\n", ir->var->name);
1526 this->result = src_reg(brw_null_reg());
1527 return;
1528 }
1529
1530 this->result = src_reg(*reg);
1531
1532 /* System values get their swizzle from the dst_reg writemask */
1533 if (ir->var->mode == ir_var_system_value)
1534 return;
1535
1536 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1537 this->result.swizzle = swizzle_for_size(type->vector_elements);
1538 }
1539
1540 void
1541 vec4_visitor::visit(ir_dereference_array *ir)
1542 {
1543 ir_constant *constant_index;
1544 src_reg src;
1545 int element_size = type_size(ir->type);
1546
1547 constant_index = ir->array_index->constant_expression_value();
1548
1549 ir->array->accept(this);
1550 src = this->result;
1551
1552 if (constant_index) {
1553 src.reg_offset += constant_index->value.i[0] * element_size;
1554 } else {
1555 /* Variable index array dereference. It eats the "vec4" of the
1556 * base of the array and an index that offsets the Mesa register
1557 * index.
1558 */
1559 ir->array_index->accept(this);
1560
1561 src_reg index_reg;
1562
1563 if (element_size == 1) {
1564 index_reg = this->result;
1565 } else {
1566 index_reg = src_reg(this, glsl_type::int_type);
1567
1568 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1569 }
1570
1571 if (src.reladdr) {
1572 src_reg temp = src_reg(this, glsl_type::int_type);
1573
1574 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1575
1576 index_reg = temp;
1577 }
1578
1579 src.reladdr = ralloc(mem_ctx, src_reg);
1580 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1581 }
1582
1583 /* If the type is smaller than a vec4, replicate the last channel out. */
1584 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1585 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1586 else
1587 src.swizzle = BRW_SWIZZLE_NOOP;
1588 src.type = brw_type_for_base_type(ir->type);
1589
1590 this->result = src;
1591 }
1592
1593 void
1594 vec4_visitor::visit(ir_dereference_record *ir)
1595 {
1596 unsigned int i;
1597 const glsl_type *struct_type = ir->record->type;
1598 int offset = 0;
1599
1600 ir->record->accept(this);
1601
1602 for (i = 0; i < struct_type->length; i++) {
1603 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1604 break;
1605 offset += type_size(struct_type->fields.structure[i].type);
1606 }
1607
1608 /* If the type is smaller than a vec4, replicate the last channel out. */
1609 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1610 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1611 else
1612 this->result.swizzle = BRW_SWIZZLE_NOOP;
1613 this->result.type = brw_type_for_base_type(ir->type);
1614
1615 this->result.reg_offset += offset;
1616 }
1617
1618 /**
1619 * We want to be careful in assignment setup to hit the actual storage
1620 * instead of potentially using a temporary like we might with the
1621 * ir_dereference handler.
1622 */
1623 static dst_reg
1624 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1625 {
1626 /* The LHS must be a dereference. If the LHS is a variable indexed array
1627 * access of a vector, it must be separated into a series conditional moves
1628 * before reaching this point (see ir_vec_index_to_cond_assign).
1629 */
1630 assert(ir->as_dereference());
1631 ir_dereference_array *deref_array = ir->as_dereference_array();
1632 if (deref_array) {
1633 assert(!deref_array->array->type->is_vector());
1634 }
1635
1636 /* Use the rvalue deref handler for the most part. We'll ignore
1637 * swizzles in it and write swizzles using writemask, though.
1638 */
1639 ir->accept(v);
1640 return dst_reg(v->result);
1641 }
1642
1643 void
1644 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1645 const struct glsl_type *type, uint32_t predicate)
1646 {
1647 if (type->base_type == GLSL_TYPE_STRUCT) {
1648 for (unsigned int i = 0; i < type->length; i++) {
1649 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1650 }
1651 return;
1652 }
1653
1654 if (type->is_array()) {
1655 for (unsigned int i = 0; i < type->length; i++) {
1656 emit_block_move(dst, src, type->fields.array, predicate);
1657 }
1658 return;
1659 }
1660
1661 if (type->is_matrix()) {
1662 const struct glsl_type *vec_type;
1663
1664 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1665 type->vector_elements, 1);
1666
1667 for (int i = 0; i < type->matrix_columns; i++) {
1668 emit_block_move(dst, src, vec_type, predicate);
1669 }
1670 return;
1671 }
1672
1673 assert(type->is_scalar() || type->is_vector());
1674
1675 dst->type = brw_type_for_base_type(type);
1676 src->type = dst->type;
1677
1678 dst->writemask = (1 << type->vector_elements) - 1;
1679
1680 src->swizzle = swizzle_for_size(type->vector_elements);
1681
1682 vec4_instruction *inst = emit(MOV(*dst, *src));
1683 inst->predicate = predicate;
1684
1685 dst->reg_offset++;
1686 src->reg_offset++;
1687 }
1688
1689
1690 /* If the RHS processing resulted in an instruction generating a
1691 * temporary value, and it would be easy to rewrite the instruction to
1692 * generate its result right into the LHS instead, do so. This ends
1693 * up reliably removing instructions where it can be tricky to do so
1694 * later without real UD chain information.
1695 */
1696 bool
1697 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1698 dst_reg dst,
1699 src_reg src,
1700 vec4_instruction *pre_rhs_inst,
1701 vec4_instruction *last_rhs_inst)
1702 {
1703 /* This could be supported, but it would take more smarts. */
1704 if (ir->condition)
1705 return false;
1706
1707 if (pre_rhs_inst == last_rhs_inst)
1708 return false; /* No instructions generated to work with. */
1709
1710 /* Make sure the last instruction generated our source reg. */
1711 if (src.file != GRF ||
1712 src.file != last_rhs_inst->dst.file ||
1713 src.reg != last_rhs_inst->dst.reg ||
1714 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1715 src.reladdr ||
1716 src.abs ||
1717 src.negate ||
1718 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1719 return false;
1720
1721 /* Check that the last instruction fully initialized the channels
1722 * we want to use, in the order we want to use them. We could
1723 * potentially reswizzle the operands of many instructions so that
1724 * we could handle out of order channels, but don't yet.
1725 */
1726
1727 for (unsigned i = 0; i < 4; i++) {
1728 if (dst.writemask & (1 << i)) {
1729 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1730 return false;
1731
1732 if (BRW_GET_SWZ(src.swizzle, i) != i)
1733 return false;
1734 }
1735 }
1736
1737 /* Success! Rewrite the instruction. */
1738 last_rhs_inst->dst.file = dst.file;
1739 last_rhs_inst->dst.reg = dst.reg;
1740 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1741 last_rhs_inst->dst.reladdr = dst.reladdr;
1742 last_rhs_inst->dst.writemask &= dst.writemask;
1743
1744 return true;
1745 }
1746
1747 void
1748 vec4_visitor::visit(ir_assignment *ir)
1749 {
1750 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1751 uint32_t predicate = BRW_PREDICATE_NONE;
1752
1753 if (!ir->lhs->type->is_scalar() &&
1754 !ir->lhs->type->is_vector()) {
1755 ir->rhs->accept(this);
1756 src_reg src = this->result;
1757
1758 if (ir->condition) {
1759 emit_bool_to_cond_code(ir->condition, &predicate);
1760 }
1761
1762 /* emit_block_move doesn't account for swizzles in the source register.
1763 * This should be ok, since the source register is a structure or an
1764 * array, and those can't be swizzled. But double-check to be sure.
1765 */
1766 assert(src.swizzle ==
1767 (ir->rhs->type->is_matrix()
1768 ? swizzle_for_size(ir->rhs->type->vector_elements)
1769 : BRW_SWIZZLE_NOOP));
1770
1771 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1772 return;
1773 }
1774
1775 /* Now we're down to just a scalar/vector with writemasks. */
1776 int i;
1777
1778 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1779 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1780
1781 ir->rhs->accept(this);
1782
1783 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1784
1785 src_reg src = this->result;
1786
1787 int swizzles[4];
1788 int first_enabled_chan = 0;
1789 int src_chan = 0;
1790
1791 assert(ir->lhs->type->is_vector() ||
1792 ir->lhs->type->is_scalar());
1793 dst.writemask = ir->write_mask;
1794
1795 for (int i = 0; i < 4; i++) {
1796 if (dst.writemask & (1 << i)) {
1797 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1798 break;
1799 }
1800 }
1801
1802 /* Swizzle a small RHS vector into the channels being written.
1803 *
1804 * glsl ir treats write_mask as dictating how many channels are
1805 * present on the RHS while in our instructions we need to make
1806 * those channels appear in the slots of the vec4 they're written to.
1807 */
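/* As a sketch of that remapping: for something like "v.zw = u.xy" the
 * write mask covers z and w and the RHS arrives swizzled as (x,y,y,y), so
 * the loop below produces (y,y,x,y) -- channels z and w of the MOV read
 * u.x and u.y, and the unwritten channels just replicate the first
 * enabled one.
 */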
1808 for (int i = 0; i < 4; i++) {
1809 if (dst.writemask & (1 << i))
1810 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1811 else
1812 swizzles[i] = first_enabled_chan;
1813 }
1814 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1815 swizzles[2], swizzles[3]);
1816
1817 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1818 return;
1819 }
1820
1821 if (ir->condition) {
1822 emit_bool_to_cond_code(ir->condition, &predicate);
1823 }
1824
1825 for (i = 0; i < type_size(ir->lhs->type); i++) {
1826 vec4_instruction *inst = emit(MOV(dst, src));
1827 inst->predicate = predicate;
1828
1829 dst.reg_offset++;
1830 src.reg_offset++;
1831 }
1832 }
1833
1834 void
1835 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1836 {
1837 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1838 foreach_list(node, &ir->components) {
1839 ir_constant *field_value = (ir_constant *)node;
1840
1841 emit_constant_values(dst, field_value);
1842 }
1843 return;
1844 }
1845
1846 if (ir->type->is_array()) {
1847 for (unsigned int i = 0; i < ir->type->length; i++) {
1848 emit_constant_values(dst, ir->array_elements[i]);
1849 }
1850 return;
1851 }
1852
1853 if (ir->type->is_matrix()) {
1854 for (int i = 0; i < ir->type->matrix_columns; i++) {
1855 float *vec = &ir->value.f[i * ir->type->vector_elements];
1856
1857 for (int j = 0; j < ir->type->vector_elements; j++) {
1858 dst->writemask = 1 << j;
1859 dst->type = BRW_REGISTER_TYPE_F;
1860
1861 emit(MOV(*dst, src_reg(vec[j])));
1862 }
1863 dst->reg_offset++;
1864 }
1865 return;
1866 }
1867
1868 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1869
1870 for (int i = 0; i < ir->type->vector_elements; i++) {
1871 if (!(remaining_writemask & (1 << i)))
1872 continue;
1873
1874 dst->writemask = 1 << i;
1875 dst->type = brw_type_for_base_type(ir->type);
1876
1877 /* Find other components that match the one we're about to
1878 * write. Emits fewer instructions for things like vec4(0.5,
1879 * 1.5, 1.5, 1.5).
1880 */
1881 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1882 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1883 if (ir->value.b[i] == ir->value.b[j])
1884 dst->writemask |= (1 << j);
1885 } else {
1886 /* u, i, and f storage all line up, so no need for a
1887 * switch case for comparing each type.
1888 */
1889 if (ir->value.u[i] == ir->value.u[j])
1890 dst->writemask |= (1 << j);
1891 }
1892 }
1893
1894 switch (ir->type->base_type) {
1895 case GLSL_TYPE_FLOAT:
1896 emit(MOV(*dst, src_reg(ir->value.f[i])));
1897 break;
1898 case GLSL_TYPE_INT:
1899 emit(MOV(*dst, src_reg(ir->value.i[i])));
1900 break;
1901 case GLSL_TYPE_UINT:
1902 emit(MOV(*dst, src_reg(ir->value.u[i])));
1903 break;
1904 case GLSL_TYPE_BOOL:
1905 emit(MOV(*dst, src_reg(ir->value.b[i])));
1906 break;
1907 default:
1908 assert(!"Non-float/uint/int/bool constant");
1909 break;
1910 }
1911
1912 remaining_writemask &= ~dst->writemask;
1913 }
1914 dst->reg_offset++;
1915 }
1916
1917 void
1918 vec4_visitor::visit(ir_constant *ir)
1919 {
1920 dst_reg dst = dst_reg(this, ir->type);
1921 this->result = src_reg(dst);
1922
1923 emit_constant_values(&dst, ir);
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_call *ir)
1928 {
1929 assert(!"not reached");
1930 }
1931
1932 void
1933 vec4_visitor::visit(ir_texture *ir)
1934 {
1935 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1936
1937 /* Should be lowered by do_lower_texture_projection */
1938 assert(!ir->projector);
1939
1940 /* Generate code to compute all the subexpression trees. This has to be
1941 * done before loading any values into MRFs for the sampler message since
1942 * generating these values may involve SEND messages that need the MRFs.
1943 */
1944 src_reg coordinate;
1945 if (ir->coordinate) {
1946 ir->coordinate->accept(this);
1947 coordinate = this->result;
1948 }
1949
1950 src_reg shadow_comparitor;
1951 if (ir->shadow_comparitor) {
1952 ir->shadow_comparitor->accept(this);
1953 shadow_comparitor = this->result;
1954 }
1955
1956 const glsl_type *lod_type;
1957 src_reg lod, dPdx, dPdy;
1958 switch (ir->op) {
1959 case ir_tex:
1960 lod = src_reg(0.0f);
1961 lod_type = glsl_type::float_type;
1962 break;
1963 case ir_txf:
1964 case ir_txl:
1965 case ir_txs:
1966 ir->lod_info.lod->accept(this);
1967 lod = this->result;
1968 lod_type = ir->lod_info.lod->type;
1969 break;
1970 case ir_txd:
1971 ir->lod_info.grad.dPdx->accept(this);
1972 dPdx = this->result;
1973
1974 ir->lod_info.grad.dPdy->accept(this);
1975 dPdy = this->result;
1976
1977 lod_type = ir->lod_info.grad.dPdx->type;
1978 break;
1979 case ir_txb:
1980 break;
1981 }
1982
1983 vec4_instruction *inst = NULL;
1984 switch (ir->op) {
1985 case ir_tex:
1986 case ir_txl:
1987 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1988 break;
1989 case ir_txd:
1990 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1991 break;
1992 case ir_txf:
1993 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1994 break;
1995 case ir_txs:
1996 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1997 break;
1998 case ir_txb:
1999 assert(!"TXB is not valid for vertex shaders.");
2000 }
2001
2002 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2003
2004 /* Texel offsets go in the message header; Gen4 also requires headers. */
2005 inst->header_present = use_texture_offset || intel->gen < 5;
2006 inst->base_mrf = 2;
2007 inst->mlen = inst->header_present + 1; /* always at least one */
2008 inst->sampler = sampler;
2009 inst->dst = dst_reg(this, ir->type);
2010 inst->dst.writemask = WRITEMASK_XYZW;
2011 inst->shadow_compare = ir->shadow_comparitor != NULL;
2012
2013 if (use_texture_offset)
2014 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2015
2016 /* MRF for the first parameter */
2017 int param_base = inst->base_mrf + inst->header_present;
2018
2019 if (ir->op == ir_txs) {
2020 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2021 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2022 } else {
2023 int i, coord_mask = 0, zero_mask = 0;
2024 /* Load the coordinate */
2025 /* FINISHME: gl_clamp_mask and saturate */
2026 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2027 coord_mask |= (1 << i);
2028 for (; i < 4; i++)
2029 zero_mask |= (1 << i);
2030
2031 if (ir->offset && ir->op == ir_txf) {
2032 /* It appears that the ld instruction used for txf does its
2033 * address bounds check before adding in the offset. To work
2034 * around this, just add the integer offset to the integer
2035 * texel coordinate, and don't put the offset in the header.
2036 */
2037 ir_constant *offset = ir->offset->as_constant();
2038 assert(offset);
2039
2040 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2041 src_reg src = coordinate;
2042 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2043 BRW_GET_SWZ(src.swizzle, j),
2044 BRW_GET_SWZ(src.swizzle, j),
2045 BRW_GET_SWZ(src.swizzle, j));
2046 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2047 src, offset->value.i[j]));
2048 }
2049 } else {
2050 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2051 coordinate));
2052 }
2053 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2054 src_reg(0)));
2055 /* Load the shadow comparitor */
2056 if (ir->shadow_comparitor) {
2057 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2058 WRITEMASK_X),
2059 shadow_comparitor));
2060 inst->mlen++;
2061 }
2062
2063 /* Load the LOD info */
2064 if (ir->op == ir_tex || ir->op == ir_txl) {
2065 int mrf, writemask;
2066 if (intel->gen >= 5) {
2067 mrf = param_base + 1;
2068 if (ir->shadow_comparitor) {
2069 writemask = WRITEMASK_Y;
2070 /* mlen already incremented */
2071 } else {
2072 writemask = WRITEMASK_X;
2073 inst->mlen++;
2074 }
2075 } else /* intel->gen == 4 */ {
2076 mrf = param_base;
2077 writemask = WRITEMASK_Z;
2078 }
2079 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2080 } else if (ir->op == ir_txf) {
2081 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2082 lod));
2083 } else if (ir->op == ir_txd) {
2084 const glsl_type *type = lod_type;
2085
2086 if (intel->gen >= 5) {
2087 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2088 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2089 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2090 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2091 inst->mlen++;
2092
2093 if (ir->type->vector_elements == 3) {
2094 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2095 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2096 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2097 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2098 inst->mlen++;
2099 }
2100 } else /* intel->gen == 4 */ {
2101 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2102 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2103 inst->mlen += 2;
2104 }
2105 }
2106 }
2107
2108 emit(inst);
2109
2110    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2111     * faces * layers, but the spec requires just layers.
2112     */
2113 if (ir->op == ir_txs) {
2114 glsl_type const *type = ir->sampler->type;
2115 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2116 type->sampler_array) {
2117 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2118 with_writemask(inst->dst, WRITEMASK_Z),
2119 src_reg(inst->dst), src_reg(6));
2120 }
2121 }
2122
2123 swizzle_result(ir, src_reg(inst->dst), sampler);
2124 }
2125
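/**
 * Applies the texture swizzle from the sampler key (e.g. depth texture
 * modes and EXT_texture_swizzle) to the raw sampler result, writing the
 * swizzled value to this->result.
 */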
2126 void
2127 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2128 {
2129 int s = c->key.tex.swizzles[sampler];
2130
2131 this->result = src_reg(this, ir->type);
2132 dst_reg swizzled_result(this->result);
2133
2134 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2135 || s == SWIZZLE_NOOP) {
2136 emit(MOV(swizzled_result, orig_val));
2137 return;
2138 }
2139
2140 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2141 int swizzle[4];
2142
2143 for (int i = 0; i < 4; i++) {
2144 switch (GET_SWZ(s, i)) {
2145 case SWIZZLE_ZERO:
2146 zero_mask |= (1 << i);
2147 break;
2148 case SWIZZLE_ONE:
2149 one_mask |= (1 << i);
2150 break;
2151 default:
2152 copy_mask |= (1 << i);
2153 swizzle[i] = GET_SWZ(s, i);
2154 break;
2155 }
2156 }
2157
2158 if (copy_mask) {
2159 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2160 swizzled_result.writemask = copy_mask;
2161 emit(MOV(swizzled_result, orig_val));
2162 }
2163
2164 if (zero_mask) {
2165 swizzled_result.writemask = zero_mask;
2166 emit(MOV(swizzled_result, src_reg(0.0f)));
2167 }
2168
2169 if (one_mask) {
2170 swizzled_result.writemask = one_mask;
2171 emit(MOV(swizzled_result, src_reg(1.0f)));
2172 }
2173 }
2174
2175 void
2176 vec4_visitor::visit(ir_return *ir)
2177 {
2178 assert(!"not reached");
2179 }
2180
2181 void
2182 vec4_visitor::visit(ir_discard *ir)
2183 {
2184 assert(!"not reached");
2185 }
2186
2187 void
2188 vec4_visitor::visit(ir_if *ir)
2189 {
2190    /* Don't point the annotation at the if statement itself; otherwise the
2191     * annotation would be printed for it plus the whole then and else blocks.
2192     */
2193 this->base_ir = ir->condition;
2194
2195 if (intel->gen == 6) {
2196 emit_if_gen6(ir);
2197 } else {
2198 uint32_t predicate;
2199 emit_bool_to_cond_code(ir->condition, &predicate);
2200 emit(IF(predicate));
2201 }
2202
2203 visit_instructions(&ir->then_instructions);
2204
2205 if (!ir->else_instructions.is_empty()) {
2206 this->base_ir = ir->condition;
2207 emit(BRW_OPCODE_ELSE);
2208
2209 visit_instructions(&ir->else_instructions);
2210 }
2211
2212 this->base_ir = ir->condition;
2213 emit(BRW_OPCODE_ENDIF);
2214 }
2215
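/**
 * Computes the NDC coordinate (x/w, y/w, z/w, 1/w) from gl_Position for
 * the pre-gen6 VUE header.
 */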
2216 void
2217 vec4_visitor::emit_ndc_computation()
2218 {
2219 /* Get the position */
2220 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2221
2222 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2223 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2224 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2225
2226 current_annotation = "NDC";
2227 dst_reg ndc_w = ndc;
2228 ndc_w.writemask = WRITEMASK_W;
2229 src_reg pos_w = pos;
2230 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2231 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2232
2233 dst_reg ndc_xyz = ndc;
2234 ndc_xyz.writemask = WRITEMASK_XYZ;
2235
2236 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2237 }
2238
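/**
 * Fills the first VUE header register: on pre-gen6 this packs the point
 * size and the user clip flags into one UD register; on gen6+ it writes
 * zero and, if the shader wrote gl_PointSize, the point size in .w.
 */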
2239 void
2240 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2241 {
2242 if (intel->gen < 6 &&
2243 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2244 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2245 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2246 dst_reg header1_w = header1;
2247 header1_w.writemask = WRITEMASK_W;
2248 GLuint i;
2249
2250 emit(MOV(header1, 0u));
2251
2252 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2253 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2254
2255 current_annotation = "Point size";
2256 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2257 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2258 }
2259
2260 current_annotation = "Clipping flags";
2261 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2262 vec4_instruction *inst;
2263
2264 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2265 src_reg(this->userplane[i])));
2266 inst->conditional_mod = BRW_CONDITIONAL_L;
2267
2268 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2269 inst->predicate = BRW_PREDICATE_NORMAL;
2270 }
2271
2272       /* i965 clipping workaround:
2273        * 1) Test for a negative RHW.
2274        * 2) If it is negative:
2275        *       set ndc = (0,0,0,0)
2276        *       set ucp[6] = 1
2277        *
2278        * Later, clipping will detect ucp[6] and ensure the primitive is
2279        * clipped against all fixed planes.
2280        */
2281 if (brw->has_negative_rhw_bug) {
2282 #if 0
2283 /* FINISHME */
2284 brw_CMP(p,
2285 vec8(brw_null_reg()),
2286 BRW_CONDITIONAL_L,
2287 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2288 brw_imm_f(0));
2289
2290 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2291 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2292 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2293 #endif
2294 }
2295
2296 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2297 } else if (intel->gen < 6) {
2298 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2299 } else {
2300 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2301 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2302 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2303 src_reg(output_reg[VERT_RESULT_PSIZ])));
2304 }
2305 }
2306 }
2307
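/**
 * Writes up to four user clip distances, starting at user plane index
 * 'offset', by taking the DP4 of the clip vertex (gl_ClipVertex if
 * written, otherwise gl_Position) with each user clip plane.
 */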
2308 void
2309 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2310 {
2311 if (intel->gen < 6) {
2312 /* Clip distance slots are set aside in gen5, but they are not used. It
2313 * is not clear whether we actually need to set aside space for them,
2314 * but the performance cost is negligible.
2315 */
2316 return;
2317 }
2318
2319 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2320 *
2321 * "If a linked set of shaders forming the vertex stage contains no
2322 * static write to gl_ClipVertex or gl_ClipDistance, but the
2323 * application has requested clipping against user clip planes through
2324 * the API, then the coordinate written to gl_Position is used for
2325 * comparison against the user clip planes."
2326 *
2327 * This function is only called if the shader didn't write to
2328 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2329 * if the user wrote to it; otherwise we use gl_Position.
2330 */
2331 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2332 if (!(c->prog_data.outputs_written
2333 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2334 clip_vertex = VERT_RESULT_HPOS;
2335 }
2336
2337 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2338 ++i) {
2339 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2340 src_reg(output_reg[clip_vertex]),
2341 src_reg(this->userplane[i + offset])));
2342 }
2343 }
2344
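/**
 * Copies an ordinary varying from its output register into the URB slot,
 * saturating color outputs when vertex color clamping is enabled.
 */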
2345 void
2346 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2347 {
2348    assert(vert_result < VERT_RESULT_MAX);
2349 reg.type = output_reg[vert_result].type;
2350 current_annotation = output_reg_annotation[vert_result];
2351 /* Copy the register, saturating if necessary */
2352 vec4_instruction *inst = emit(MOV(reg,
2353 src_reg(output_reg[vert_result])));
2354 if ((vert_result == VERT_RESULT_COL0 ||
2355 vert_result == VERT_RESULT_COL1 ||
2356 vert_result == VERT_RESULT_BFC0 ||
2357 vert_result == VERT_RESULT_BFC1) &&
2358 c->key.clamp_vertex_color) {
2359 inst->saturate = true;
2360 }
2361 }
2362
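/**
 * Writes a single VUE slot into the given MRF, special-casing the slots
 * that need it (header/PSIZ, NDC, position, clip distances, edge flag,
 * padding) and using the generic copy otherwise.
 */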
2363 void
2364 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2365 {
2366 struct brw_reg hw_reg = brw_message_reg(mrf);
2367 dst_reg reg = dst_reg(MRF, mrf);
2368 reg.type = BRW_REGISTER_TYPE_F;
2369
2370 switch (vert_result) {
2371 case VERT_RESULT_PSIZ:
2372 /* PSIZ is always in slot 0, and is coupled with other flags. */
2373 current_annotation = "indices, point width, clip flags";
2374 emit_psiz_and_flags(hw_reg);
2375 break;
2376 case BRW_VERT_RESULT_NDC:
2377 current_annotation = "NDC";
2378 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2379 break;
2380 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2381 case VERT_RESULT_HPOS:
2382 current_annotation = "gl_Position";
2383 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2384 break;
2385 case VERT_RESULT_CLIP_DIST0:
2386 case VERT_RESULT_CLIP_DIST1:
2387 if (this->c->key.uses_clip_distance) {
2388 emit_generic_urb_slot(reg, vert_result);
2389 } else {
2390 current_annotation = "user clip distances";
2391 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2392 }
2393 break;
2394 case VERT_RESULT_EDGE:
2395       /* This slot is present when drawing unfilled polygons. We copy the edge
2396        * flag from the user-provided vertex array (glEdgeFlagPointer) if one is
2397        * supplied, and otherwise from the current value of that attribute
2398        * (which starts as 1.0f). The clipper then uses it to determine which
2399        * edges should be drawn as wireframe.
2400        */
2401 current_annotation = "edge flag";
2402 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2403 glsl_type::float_type, WRITEMASK_XYZW))));
2404 break;
2405 case BRW_VERT_RESULT_PAD:
2406 /* No need to write to this slot */
2407 break;
2408 default:
2409 emit_generic_urb_slot(reg, vert_result);
2410 break;
2411 }
2412 }
2413
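/**
 * Rounds the URB write message length up so that the data payload
 * (everything after the single header register) is an even number of
 * registers, as gen6+ interleaved URB writes require.
 */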
2414 static int
2415 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2416 {
2417 struct intel_context *intel = &brw->intel;
2418
2419 if (intel->gen >= 6) {
2420 /* URB data written (does not include the message header reg) must
2421 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2422 * section 5.4.3.2.2: URB_INTERLEAVED.
2423 *
2424 * URB entries are allocated on a multiple of 1024 bits, so an
2425 * extra 128 bits written here to make the end align to 256 is
2426 * no problem.
2427 */
2428       if ((mlen % 2) == 0)
2429          mlen++;
2430 }
2431
2432 return mlen;
2433 }
2434
2435 /**
2436 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2437 * complete the VS thread.
2438 *
2439 * The VUE layout is documented in Volume 2a.
2440 */
2441 void
2442 vec4_visitor::emit_urb_writes()
2443 {
2444    /* MRF 0 is reserved for the debugger, so start with the message header
2445     * in MRF 1.
2446     */
2447 int base_mrf = 1;
2448 int mrf = base_mrf;
2449 /* In the process of generating our URB write message contents, we
2450 * may need to unspill a register or load from an array. Those
2451 * reads would use MRFs 14-15.
2452 */
2453 int max_usable_mrf = 13;
2454
2455    /* The following assertion verifies that max_usable_mrf results in an
2456     * even number of URB write data registers, which meets gen6's
2457     * requirement for length alignment.
2458     */
2459    assert((max_usable_mrf - base_mrf) % 2 == 0);
2460
2461 /* First mrf is the g0-based message header containing URB handles and such,
2462 * which is implied in VS_OPCODE_URB_WRITE.
2463 */
2464 mrf++;
2465
2466 if (intel->gen < 6) {
2467 emit_ndc_computation();
2468 }
2469
2470 /* Set up the VUE data for the first URB write */
2471 int slot;
2472 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2473 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2474
2475 /* If this was max_usable_mrf, we can't fit anything more into this URB
2476 * WRITE.
2477 */
2478 if (mrf > max_usable_mrf) {
2479 slot++;
2480 break;
2481 }
2482 }
2483
2484 current_annotation = "URB write";
2485 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2486 inst->base_mrf = base_mrf;
2487 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2488 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2489
2490 /* Optional second URB write */
2491 if (!inst->eot) {
2492 mrf = base_mrf + 1;
2493
2494 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2495 assert(mrf < max_usable_mrf);
2496
2497 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2498 }
2499
2500 current_annotation = "URB write";
2501 inst = emit(VS_OPCODE_URB_WRITE);
2502 inst->base_mrf = base_mrf;
2503 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2504 inst->eot = true;
2505       /* URB destination offset. In the previous write we used MRFs 1-13;
2506        * excluding the one header MRF, that is 12 data registers. The URB
2507        * offset is in URB row increments, and each of our MRFs is half of a
2508        * row, since we're doing interleaved writes.
2509        */
2510 inst->offset = (max_usable_mrf - base_mrf) / 2;
2511 }
2512 }
2513
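/**
 * Returns the offset operand for a scratch read/write of reg_offset
 * (plus an optional relative address), scaled into the units the
 * scratch message header expects.
 */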
2514 src_reg
2515 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2516 src_reg *reladdr, int reg_offset)
2517 {
2518 /* Because we store the values to scratch interleaved like our
2519 * vertex data, we need to scale the vec4 index by 2.
2520 */
2521 int message_header_scale = 2;
2522
2523 /* Pre-gen6, the message header uses byte offsets instead of vec4
2524 * (16-byte) offset units.
2525 */
2526 if (intel->gen < 6)
2527 message_header_scale *= 16;
2528
2529 if (reladdr) {
2530 src_reg index = src_reg(this, glsl_type::int_type);
2531
2532 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2533 emit_before(inst, MUL(dst_reg(index),
2534 index, src_reg(message_header_scale)));
2535
2536 return index;
2537 } else {
2538 return src_reg(reg_offset * message_header_scale);
2539 }
2540 }
2541
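/**
 * Returns the offset operand for a pull constant load of reg_offset
 * (plus an optional relative address): byte units pre-gen6, vec4 units
 * on gen6+.
 */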
2542 src_reg
2543 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2544 src_reg *reladdr, int reg_offset)
2545 {
2546 if (reladdr) {
2547 src_reg index = src_reg(this, glsl_type::int_type);
2548
2549 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2550
2551 /* Pre-gen6, the message header uses byte offsets instead of vec4
2552 * (16-byte) offset units.
2553 */
2554 if (intel->gen < 6) {
2555 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2556 }
2557
2558 return index;
2559 } else {
2560 int message_header_scale = intel->gen < 6 ? 16 : 1;
2561 return src_reg(reg_offset * message_header_scale);
2562 }
2563 }
2564
2565 /**
2566 * Emits an instruction before @inst to load the value named by @orig_src
2567 * from scratch space at @base_offset to @temp.
2568 *
2569 * @base_offset is measured in 32-byte units (the size of a register).
2570 */
2571 void
2572 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2573 dst_reg temp, src_reg orig_src,
2574 int base_offset)
2575 {
2576 int reg_offset = base_offset + orig_src.reg_offset;
2577 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2578
2579 emit_before(inst, SCRATCH_READ(temp, index));
2580 }
2581
2582 /**
2583 * Emits an instruction after @inst to store the value to be written
2584 * to @orig_dst to scratch space at @base_offset, from @temp.
2585 *
2586 * @base_offset is measured in 32-byte units (the size of a register).
2587 */
2588 void
2589 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2590 {
2591 int reg_offset = base_offset + inst->dst.reg_offset;
2592 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2593
2594 /* Create a temporary register to store *inst's result in.
2595 *
2596 * We have to be careful in MOVing from our temporary result register in
2597 * the scratch write. If we swizzle from channels of the temporary that
2598 * weren't initialized, it will confuse live interval analysis, which will
2599 * make spilling fail to make progress.
2600 */
2601 src_reg temp = src_reg(this, glsl_type::vec4_type);
2602 temp.type = inst->dst.type;
2603 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2604 int swizzles[4];
2605 for (int i = 0; i < 4; i++)
2606 if (inst->dst.writemask & (1 << i))
2607 swizzles[i] = i;
2608 else
2609 swizzles[i] = first_writemask_chan;
2610 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2611 swizzles[2], swizzles[3]);
2612
2613 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2614 inst->dst.writemask));
2615 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2616 write->predicate = inst->predicate;
2617 write->ir = inst->ir;
2618 write->annotation = inst->annotation;
2619 inst->insert_after(write);
2620
2621 inst->dst.file = temp.file;
2622 inst->dst.reg = temp.reg;
2623 inst->dst.reg_offset = temp.reg_offset;
2624 inst->dst.reladdr = NULL;
2625 }
2626
2627 /**
2628 * We can't generally support array access in GRF space, because a
2629 * single instruction's destination can only span 2 contiguous
2630 * registers. So, we send all GRF arrays that get variable index
2631 * access to scratch space.
2632 */
2633 void
2634 vec4_visitor::move_grf_array_access_to_scratch()
2635 {
2636 int scratch_loc[this->virtual_grf_count];
2637
2638 for (int i = 0; i < this->virtual_grf_count; i++) {
2639 scratch_loc[i] = -1;
2640 }
2641
2642    /* First, calculate the set of virtual GRFs that need to be punted to
2643     * scratch due to having any array access on them, and assign each one
2644     * its location in scratch space.
2645     */
2646 foreach_list(node, &this->instructions) {
2647 vec4_instruction *inst = (vec4_instruction *)node;
2648
2649 if (inst->dst.file == GRF && inst->dst.reladdr &&
2650 scratch_loc[inst->dst.reg] == -1) {
2651 scratch_loc[inst->dst.reg] = c->last_scratch;
2652 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2653 }
2654
2655 for (int i = 0 ; i < 3; i++) {
2656 src_reg *src = &inst->src[i];
2657
2658 if (src->file == GRF && src->reladdr &&
2659 scratch_loc[src->reg] == -1) {
2660 scratch_loc[src->reg] = c->last_scratch;
2661 c->last_scratch += this->virtual_grf_sizes[src->reg];
2662 }
2663 }
2664 }
2665
2666 /* Now, for anything that will be accessed through scratch, rewrite
2667 * it to load/store. Note that this is a _safe list walk, because
2668 * we may generate a new scratch_write instruction after the one
2669 * we're processing.
2670 */
2671 foreach_list_safe(node, &this->instructions) {
2672 vec4_instruction *inst = (vec4_instruction *)node;
2673
2674       /* Set up the annotation tracking for newly generated instructions. */
2675 base_ir = inst->ir;
2676 current_annotation = inst->annotation;
2677
2678 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2679 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2680 }
2681
2682 for (int i = 0 ; i < 3; i++) {
2683 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2684 continue;
2685
2686 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2687
2688 emit_scratch_read(inst, temp, inst->src[i],
2689 scratch_loc[inst->src[i].reg]);
2690
2691 inst->src[i].file = temp.file;
2692 inst->src[i].reg = temp.reg;
2693 inst->src[i].reg_offset = temp.reg_offset;
2694 inst->src[i].reladdr = NULL;
2695 }
2696 }
2697 }
2698
2699 /**
2700 * Emits an instruction before @inst to load the value named by @orig_src
2701 * from the pull constant buffer (surface) at @base_offset to @temp.
2702 */
2703 void
2704 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2705 dst_reg temp, src_reg orig_src,
2706 int base_offset)
2707 {
2708 int reg_offset = base_offset + orig_src.reg_offset;
2709 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2710 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2711 vec4_instruction *load;
2712
2713 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2714 temp, index, offset);
2715 load->base_mrf = 14;
2716 load->mlen = 1;
2717 emit_before(inst, load);
2718 }
2719
2720 /**
2721 * Implements array access of uniforms by inserting a
2722 * PULL_CONSTANT_LOAD instruction.
2723 *
2724  * Unlike temporary GRF array access, which we don't support because of
2725  * the difficulty of doing relative addressing on instruction
2726  * destinations, we could potentially support array access of uniforms
2727  * that were loaded in GRF space as push constants. In the real-world
2728  * usage we've seen, though, the arrays are always larger than we could
2729  * load as push constants, so we just always move all uniform array
2730  * access out to a pull constant buffer.
2731 */
2732 void
2733 vec4_visitor::move_uniform_array_access_to_pull_constants()
2734 {
2735 int pull_constant_loc[this->uniforms];
2736
2737 for (int i = 0; i < this->uniforms; i++) {
2738 pull_constant_loc[i] = -1;
2739 }
2740
2741 /* Walk through and find array access of uniforms. Put a copy of that
2742 * uniform in the pull constant buffer.
2743 *
2744 * Note that we don't move constant-indexed accesses to arrays. No
2745 * testing has been done of the performance impact of this choice.
2746 */
2747 foreach_list_safe(node, &this->instructions) {
2748 vec4_instruction *inst = (vec4_instruction *)node;
2749
2750 for (int i = 0 ; i < 3; i++) {
2751 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2752 continue;
2753
2754 int uniform = inst->src[i].reg;
2755
2756 /* If this array isn't already present in the pull constant buffer,
2757 * add it.
2758 */
2759 if (pull_constant_loc[uniform] == -1) {
2760 const float **values = &prog_data->param[uniform * 4];
2761
2762 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2763
2764 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2765 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2766 }
2767 }
2768
2769       /* Set up the annotation tracking for newly generated instructions. */
2770 base_ir = inst->ir;
2771 current_annotation = inst->annotation;
2772
2773 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2774
2775 emit_pull_constant_load(inst, temp, inst->src[i],
2776 pull_constant_loc[uniform]);
2777
2778 inst->src[i].file = temp.file;
2779 inst->src[i].reg = temp.reg;
2780 inst->src[i].reg_offset = temp.reg_offset;
2781 inst->src[i].reladdr = NULL;
2782 }
2783 }
2784
2785 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2786 * no need to track them as larger-than-vec4 objects. This will be
2787 * relied on in cutting out unused uniform vectors from push
2788 * constants.
2789 */
2790 split_uniform_registers();
2791 }
2792
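/**
 * Works around source negation on UD-typed operands: if the source is
 * unsigned with the negate flag set, materialize the negated value into
 * a temporary with a MOV and use that temporary instead.
 */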
2793 void
2794 vec4_visitor::resolve_ud_negate(src_reg *reg)
2795 {
2796 if (reg->type != BRW_REGISTER_TYPE_UD ||
2797 !reg->negate)
2798 return;
2799
2800 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2801 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2802 *reg = temp;
2803 }
2804
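/**
 * Sets up per-compile state: context pointers, the compile key and
 * prog_data, the variable hash table, and empty virtual GRF tracking.
 */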
2805 vec4_visitor::vec4_visitor(struct brw_context *brw,
2806 struct brw_vs_compile *c,
2807 struct gl_shader_program *prog,
2808 struct brw_shader *shader,
2809 void *mem_ctx)
2810 {
2811 this->c = c;
2812 this->brw = brw;
2813 this->intel = &brw->intel;
2814 this->ctx = &intel->ctx;
2815 this->prog = prog;
2816 this->shader = shader;
2817
2818 this->mem_ctx = mem_ctx;
2819 this->failed = false;
2820
2821 this->base_ir = NULL;
2822 this->current_annotation = NULL;
2823 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2824
2826 this->vp = &c->vp->program;
2827 this->prog_data = &c->prog_data;
2828
2829 this->variable_ht = hash_table_ctor(0,
2830 hash_table_pointer_hash,
2831 hash_table_pointer_compare);
2832
2833 this->virtual_grf_def = NULL;
2834 this->virtual_grf_use = NULL;
2835 this->virtual_grf_sizes = NULL;
2836 this->virtual_grf_count = 0;
2837 this->virtual_grf_reg_map = NULL;
2838 this->virtual_grf_reg_count = 0;
2839 this->virtual_grf_array_size = 0;
2840 this->live_intervals_valid = false;
2841
2842 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2843
2844 this->uniforms = 0;
2845 }
2846
2847 vec4_visitor::~vec4_visitor()
2848 {
2849 hash_table_dtor(this->variable_ht);
2850 }
2851
2852
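/**
 * Records the first compile failure message (ralloc'ed off mem_ctx) and
 * prints it to stderr when VS debugging is enabled; subsequent failures
 * are ignored.
 */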
2853 void
2854 vec4_visitor::fail(const char *format, ...)
2855 {
2856 va_list va;
2857 char *msg;
2858
2859 if (failed)
2860 return;
2861
2862 failed = true;
2863
2864 va_start(va, format);
2865 msg = ralloc_vasprintf(mem_ctx, format, va);
2866 va_end(va);
2867 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2868
2869 this->fail_msg = msg;
2870
2871 if (INTEL_DEBUG & DEBUG_VS) {
2872 fprintf(stderr, "%s", msg);
2873 }
2874 }
2875
2876 } /* namespace brw */