src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/macros.h"
  27 #include "program/prog_parameter.h"
  28 #include "program/sampler.h"
  29 }
  30
  31 namespace brw {
  32
  33 vec4_instruction::vec4_instruction(vec4_visitor *v,
  34                                    enum opcode opcode, dst_reg dst,
  35                                    src_reg src0, src_reg src1, src_reg src2)
  36 {
  37    this->opcode = opcode;
  38    this->dst = dst;
  39    this->src[0] = src0;
  40    this->src[1] = src1;
  41    this->src[2] = src2;
  42    this->ir = v->base_ir;
  43    this->annotation = v->current_annotation;
  44 }
  45
  46 vec4_instruction *
  47 vec4_visitor::emit(vec4_instruction *inst)
  48 {
  49    this->instructions.push_tail(inst);
  50
  51    return inst;
  52 }
  53
  54 vec4_instruction *
  55 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  56 {
  57    new_inst->ir = inst->ir;
  58    new_inst->annotation = inst->annotation;
  59
  60    inst->insert_before(new_inst);
  61
  62    return inst;
  63 }
  64
  65 vec4_instruction *
  66 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  67                    src_reg src0, src_reg src1, src_reg src2)
  68 {
  69    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  70                                              src0, src1, src2));
  71 }
  72
  73
  74 vec4_instruction *
  75 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  76 {
  77    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  78 }
  79
  80 vec4_instruction *
  81 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  82 {
  83    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  84 }
  85
  86 vec4_instruction *
  87 vec4_visitor::emit(enum opcode opcode)
  88 {
  89    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  90 }
  91
  92 #define ALU1(op)                                                        \
  93    vec4_instruction *                                                   \
  94    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  95    {                                                                    \
  96       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  97                                            src0);                       \
  98    }
  99
 100 #define ALU2(op)                                                        \
 101    vec4_instruction *                                                   \
 102    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 103    {                                                                    \
 104       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 105                                            src0, src1);                 \
 106    }
 107
 108 ALU1(NOT)
 109 ALU1(MOV)
 110 ALU1(FRC)
 111 ALU1(RNDD)
 112 ALU1(RNDE)
 113 ALU1(RNDZ)
 114 ALU2(ADD)
 115 ALU2(MUL)
 116 ALU2(MACH)
 117 ALU2(AND)
 118 ALU2(OR)
 119 ALU2(XOR)
 120 ALU2(DP3)
 121 ALU2(DP4)
 122 ALU2(DPH)
 123
 124 /** Gen4 predicated IF. */
 125 vec4_instruction *
 126 vec4_visitor::IF(uint32_t predicate)
 127 {
 128    vec4_instruction *inst;
 129
 130    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 131    inst->predicate = predicate;
 132
 133    return inst;
 134 }
 135
 136 /** Gen6+ IF with embedded comparison. */
 137 vec4_instruction *
 138 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 139 {
 140    assert(intel->gen >= 6);
 141
 142    vec4_instruction *inst;
 143
 144    resolve_ud_negate(&src0);
 145    resolve_ud_negate(&src1);
 146
 147    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 148                                         src0, src1);
 149    inst->conditional_mod = condition;
 150
 151    return inst;
 152 }
 153
 154 /**
 155  * CMP: Sets the low bit of the destination channels with the result
 156  * of the comparison, while the upper bits are undefined, and updates
 157  * the flag register with the packed 16 bits of the result.
 158  */
 159 vec4_instruction *
 160 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 161 {
 162    vec4_instruction *inst;
 163
 164    /* original gen4 does type conversion to the destination type
 165     * before before comparison, producing garbage results for floating
 166     * point comparisons.
 167     */
 168    if (intel->gen == 4) {
 169       dst.type = src0.type;
 170       if (dst.file == HW_REG)
 171          dst.fixed_hw_reg.type = dst.type;
 172    }
 173
 174    resolve_ud_negate(&src0);
 175    resolve_ud_negate(&src1);
 176
 177    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 178    inst->conditional_mod = condition;
 179
 180    return inst;
 181 }
 182
 183 vec4_instruction *
 184 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 185 {
 186    vec4_instruction *inst;
 187
 188    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 189                                         dst, index);
 190    inst->base_mrf = 14;
 191    inst->mlen = 2;
 192
 193    return inst;
 194 }
 195
 196 vec4_instruction *
 197 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 198 {
 199    vec4_instruction *inst;
 200
 201    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 202                                         dst, src, index);
 203    inst->base_mrf = 13;
 204    inst->mlen = 3;
 205
 206    return inst;
 207 }
 208
 209 void
 210 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 211 {
 212    static enum opcode dot_opcodes[] = {
 213       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 214    };
 215
 216    emit(dot_opcodes[elements - 2], dst, src0, src1);
 217 }
 218
 219 void
 220 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 221 {
 222    /* The gen6 math instruction ignores the source modifiers --
 223     * swizzle, abs, negate, and at least some parts of the register
 224     * region description.
 225     *
 226     * While it would seem that this MOV could be avoided at this point
 227     * in the case that the swizzle is matched up with the destination
 228     * writemask, note that uniform packing and register allocation
 229     * could rearrange our swizzle, so let's leave this matter up to
 230     * copy propagation later.
 231     */
 232    src_reg temp_src = src_reg(this, glsl_type::vec4_type);
 233    emit(MOV(dst_reg(temp_src), src));
 234
 235    if (dst.writemask != WRITEMASK_XYZW) {
 236       /* The gen6 math instruction must be align1, so we can't do
 237        * writemasks.
 238        */
 239       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 240
 241       emit(opcode, temp_dst, temp_src);
 242
 243       emit(MOV(dst, src_reg(temp_dst)));
 244    } else {
 245       emit(opcode, dst, temp_src);
 246    }
 247 }
 248
 249 void
 250 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 251 {
 252    vec4_instruction *inst = emit(opcode, dst, src);
 253    inst->base_mrf = 1;
 254    inst->mlen = 1;
 255 }
 256
 257 void
 258 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 259 {
 260    switch (opcode) {
 261    case SHADER_OPCODE_RCP:
 262    case SHADER_OPCODE_RSQ:
 263    case SHADER_OPCODE_SQRT:
 264    case SHADER_OPCODE_EXP2:
 265    case SHADER_OPCODE_LOG2:
 266    case SHADER_OPCODE_SIN:
 267    case SHADER_OPCODE_COS:
 268       break;
 269    default:
 270       assert(!"not reached: bad math opcode");
 271       return;
 272    }
 273
 274    if (intel->gen >= 7) {
 275       emit(opcode, dst, src);
 276    } else if (intel->gen == 6) {
 277       return emit_math1_gen6(opcode, dst, src);
 278    } else {
 279       return emit_math1_gen4(opcode, dst, src);
 280    }
 281 }
 282
 283 void
 284 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 285                               dst_reg dst, src_reg src0, src_reg src1)
 286 {
 287    src_reg expanded;
 288
 289    /* The gen6 math instruction ignores the source modifiers --
 290     * swizzle, abs, negate, and at least some parts of the register
 291     * region description.  Move the sources to temporaries to make it
 292     * generally work.
 293     */
 294
 295    expanded = src_reg(this, glsl_type::vec4_type);
 296    expanded.type = src0.type;
 297    emit(MOV(dst_reg(expanded), src0));
 298    src0 = expanded;
 299
 300    expanded = src_reg(this, glsl_type::vec4_type);
 301    expanded.type = src1.type;
 302    emit(MOV(dst_reg(expanded), src1));
 303    src1 = expanded;
 304
 305    if (dst.writemask != WRITEMASK_XYZW) {
 306       /* The gen6 math instruction must be align1, so we can't do
 307        * writemasks.
 308        */
 309       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 310       temp_dst.type = dst.type;
 311
 312       emit(opcode, temp_dst, src0, src1);
 313
 314       emit(MOV(dst, src_reg(temp_dst)));
 315    } else {
 316       emit(opcode, dst, src0, src1);
 317    }
 318 }
 319
 320 void
 321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 322                               dst_reg dst, src_reg src0, src_reg src1)
 323 {
 324    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 325    inst->base_mrf = 1;
 326    inst->mlen = 2;
 327 }
 328
 329 void
 330 vec4_visitor::emit_math(enum opcode opcode,
 331                         dst_reg dst, src_reg src0, src_reg src1)
 332 {
 333    switch (opcode) {
 334    case SHADER_OPCODE_POW:
 335    case SHADER_OPCODE_INT_QUOTIENT:
 336    case SHADER_OPCODE_INT_REMAINDER:
 337       break;
 338    default:
 339       assert(!"not reached: unsupported binary math opcode");
 340       return;
 341    }
 342
 343    if (intel->gen >= 7) {
 344       emit(opcode, dst, src0, src1);
 345    } else if (intel->gen == 6) {
 346       return emit_math2_gen6(opcode, dst, src0, src1);
 347    } else {
 348       return emit_math2_gen4(opcode, dst, src0, src1);
 349    }
 350 }
 351
 352 void
 353 vec4_visitor::visit_instructions(const exec_list *list)
 354 {
 355    foreach_list(node, list) {
 356       ir_instruction *ir = (ir_instruction *)node;
 357
 358       base_ir = ir;
 359       ir->accept(this);
 360    }
 361 }
 362
 363
 364 static int
 365 type_size(const struct glsl_type *type)
 366 {
 367    unsigned int i;
 368    int size;
 369
 370    switch (type->base_type) {
 371    case GLSL_TYPE_UINT:
 372    case GLSL_TYPE_INT:
 373    case GLSL_TYPE_FLOAT:
 374    case GLSL_TYPE_BOOL:
 375       if (type->is_matrix()) {
 376          return type->matrix_columns;
 377       } else {
 378          /* Regardless of size of vector, it gets a vec4. This is bad
 379           * packing for things like floats, but otherwise arrays become a
 380           * mess.  Hopefully a later pass over the code can pack scalars
 381           * down if appropriate.
 382           */
 383          return 1;
 384       }
 385    case GLSL_TYPE_ARRAY:
 386       assert(type->length > 0);
 387       return type_size(type->fields.array) * type->length;
 388    case GLSL_TYPE_STRUCT:
 389       size = 0;
 390       for (i = 0; i < type->length; i++) {
 391          size += type_size(type->fields.structure[i].type);
 392       }
 393       return size;
 394    case GLSL_TYPE_SAMPLER:
 395       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 396        * at link time.
 397        */
 398       return 1;
 399    default:
 400       assert(0);
 401       return 0;
 402    }
 403 }
 404
 405 int
 406 vec4_visitor::virtual_grf_alloc(int size)
 407 {
 408    if (virtual_grf_array_size <= virtual_grf_count) {
 409       if (virtual_grf_array_size == 0)
 410          virtual_grf_array_size = 16;
 411       else
 412          virtual_grf_array_size *= 2;
 413       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 414                                    virtual_grf_array_size);
 415       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 416                                      virtual_grf_array_size);
 417    }
 418    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 419    virtual_grf_reg_count += size;
 420    virtual_grf_sizes[virtual_grf_count] = size;
 421    return virtual_grf_count++;
 422 }
 423
 424 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 425 {
 426    init();
 427
 428    this->file = GRF;
 429    this->reg = v->virtual_grf_alloc(type_size(type));
 430
 431    if (type->is_array() || type->is_record()) {
 432       this->swizzle = BRW_SWIZZLE_NOOP;
 433    } else {
 434       this->swizzle = swizzle_for_size(type->vector_elements);
 435    }
 436
 437    this->type = brw_type_for_base_type(type);
 438 }
 439
 440 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 441 {
 442    init();
 443
 444    this->file = GRF;
 445    this->reg = v->virtual_grf_alloc(type_size(type));
 446
 447    if (type->is_array() || type->is_record()) {
 448       this->writemask = WRITEMASK_XYZW;
 449    } else {
 450       this->writemask = (1 << type->vector_elements) - 1;
 451    }
 452
 453    this->type = brw_type_for_base_type(type);
 454 }
 455
 456 /* Our support for uniforms is piggy-backed on the struct
 457  * gl_fragment_program, because that's where the values actually
 458  * get stored, rather than in some global gl_shader_program uniform
 459  * store.
 460  */
 461 int
 462 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 463 {
 464    unsigned int offset = 0;
 465    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 466
 467    if (type->is_matrix()) {
 468       const glsl_type *column = type->column_type();
 469
 470       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 471          offset += setup_uniform_values(loc + offset, column);
 472       }
 473
 474       return offset;
 475    }
 476
 477    switch (type->base_type) {
 478    case GLSL_TYPE_FLOAT:
 479    case GLSL_TYPE_UINT:
 480    case GLSL_TYPE_INT:
 481    case GLSL_TYPE_BOOL:
 482       for (unsigned int i = 0; i < type->vector_elements; i++) {
 483          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 484       }
 485
 486       /* Set up pad elements to get things aligned to a vec4 boundary. */
 487       for (unsigned int i = type->vector_elements; i < 4; i++) {
 488          static float zero = 0;
 489
 490          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 491       }
 492
 493       /* Track the size of this uniform vector, for future packing of
 494        * uniforms.
 495        */
 496       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 497       this->uniforms++;
 498
 499       return 1;
 500
 501    case GLSL_TYPE_STRUCT:
 502       for (unsigned int i = 0; i < type->length; i++) {
 503          offset += setup_uniform_values(loc + offset,
 504                                         type->fields.structure[i].type);
 505       }
 506       return offset;
 507
 508    case GLSL_TYPE_ARRAY:
 509       for (unsigned int i = 0; i < type->length; i++) {
 510          offset += setup_uniform_values(loc + offset, type->fields.array);
 511       }
 512       return offset;
 513
 514    case GLSL_TYPE_SAMPLER:
 515       /* The sampler takes up a slot, but we don't use any values from it. */
 516       return 1;
 517
 518    default:
 519       assert(!"not reached");
 520       return 0;
 521    }
 522 }
 523
 524 void
 525 vec4_visitor::setup_uniform_clipplane_values()
 526 {
 527    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 528
 529    /* Pre-Gen6, we compact clip planes.  For example, if the user
 530     * enables just clip planes 0, 1, and 3, we will enable clip planes
 531     * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 532     * plane 2.  This simplifies the implementation of the Gen6 clip
 533     * thread.
 534     *
 535     * In Gen6 and later, we don't compact clip planes, because this
 536     * simplifies the implementation of gl_ClipDistance.
 537     */
 538    int compacted_clipplane_index = 0;
 539    for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 540       if (intel->gen < 6 &&
 541           !(c->key.userclip_planes_enabled_gen_4_5 & (1 << i))) {
 542          continue;
 543       }
 544       this->uniform_vector_size[this->uniforms] = 4;
 545       this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 546       this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 547       for (int j = 0; j < 4; ++j) {
 548          c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 549       }
 550       ++compacted_clipplane_index;
 551       ++this->uniforms;
 552    }
 553 }
 554
 555 /* Our support for builtin uniforms is even scarier than non-builtin.
 556  * It sits on top of the PROG_STATE_VAR parameters that are
 557  * automatically updated from GL context state.
 558  */
 559 void
 560 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 561 {
 562    const ir_state_slot *const slots = ir->state_slots;
 563    assert(ir->state_slots != NULL);
 564
 565    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 566       /* This state reference has already been setup by ir_to_mesa,
 567        * but we'll get the same index back here.  We can reference
 568        * ParameterValues directly, since unlike brw_fs.cpp, we never
 569        * add new state references during compile.
 570        */
 571       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 572                                             (gl_state_index *)slots[i].tokens);
 573       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 574
 575       this->uniform_vector_size[this->uniforms] = 0;
 576       /* Add each of the unique swizzled channels of the element.
 577        * This will end up matching the size of the glsl_type of this field.
 578        */
 579       int last_swiz = -1;
 580       for (unsigned int j = 0; j < 4; j++) {
 581          int swiz = GET_SWZ(slots[i].swizzle, j);
 582          last_swiz = swiz;
 583
 584          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 585          if (swiz <= last_swiz)
 586             this->uniform_vector_size[this->uniforms]++;
 587       }
 588       this->uniforms++;
 589    }
 590 }
 591
 592 dst_reg *
 593 vec4_visitor::variable_storage(ir_variable *var)
 594 {
 595    return (dst_reg *)hash_table_find(this->variable_ht, var);
 596 }
 597
 598 void
 599 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 600 {
 601    ir_expression *expr = ir->as_expression();
 602
 603    *predicate = BRW_PREDICATE_NORMAL;
 604
 605    if (expr) {
 606       src_reg op[2];
 607       vec4_instruction *inst;
 608
 609       assert(expr->get_num_operands() <= 2);
 610       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 611          expr->operands[i]->accept(this);
 612          op[i] = this->result;
 613
 614          resolve_ud_negate(&op[i]);
 615       }
 616
 617       switch (expr->operation) {
 618       case ir_unop_logic_not:
 619          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 620          inst->conditional_mod = BRW_CONDITIONAL_Z;
 621          break;
 622
 623       case ir_binop_logic_xor:
 624          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 625          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 626          break;
 627
 628       case ir_binop_logic_or:
 629          inst = emit(OR(dst_null_d(), op[0], op[1]));
 630          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 631          break;
 632
 633       case ir_binop_logic_and:
 634          inst = emit(AND(dst_null_d(), op[0], op[1]));
 635          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 636          break;
 637
 638       case ir_unop_f2b:
 639          if (intel->gen >= 6) {
 640             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 641          } else {
 642             inst = emit(MOV(dst_null_f(), op[0]));
 643             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 644          }
 645          break;
 646
 647       case ir_unop_i2b:
 648          if (intel->gen >= 6) {
 649             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 650          } else {
 651             inst = emit(MOV(dst_null_d(), op[0]));
 652             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 653          }
 654          break;
 655
 656       case ir_binop_all_equal:
 657          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 658          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 659          break;
 660
 661       case ir_binop_any_nequal:
 662          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 663          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 664          break;
 665
 666       case ir_unop_any:
 667          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 668          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 669          break;
 670
 671       case ir_binop_greater:
 672       case ir_binop_gequal:
 673       case ir_binop_less:
 674       case ir_binop_lequal:
 675       case ir_binop_equal:
 676       case ir_binop_nequal:
 677          emit(CMP(dst_null_d(), op[0], op[1],
 678                   brw_conditional_for_comparison(expr->operation)));
 679          break;
 680
 681       default:
 682          assert(!"not reached");
 683          break;
 684       }
 685       return;
 686    }
 687
 688    ir->accept(this);
 689
 690    resolve_ud_negate(&this->result);
 691
 692    if (intel->gen >= 6) {
 693       vec4_instruction *inst = emit(AND(dst_null_d(),
 694                                         this->result, src_reg(1)));
 695       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 696    } else {
 697       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 698       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 699    }
 700 }
 701
 702 /**
 703  * Emit a gen6 IF statement with the comparison folded into the IF
 704  * instruction.
 705  */
 706 void
 707 vec4_visitor::emit_if_gen6(ir_if *ir)
 708 {
 709    ir_expression *expr = ir->condition->as_expression();
 710
 711    if (expr) {
 712       src_reg op[2];
 713       dst_reg temp;
 714
 715       assert(expr->get_num_operands() <= 2);
 716       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 717          expr->operands[i]->accept(this);
 718          op[i] = this->result;
 719       }
 720
 721       switch (expr->operation) {
 722       case ir_unop_logic_not:
 723          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 724          return;
 725
 726       case ir_binop_logic_xor:
 727          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 728          return;
 729
 730       case ir_binop_logic_or:
 731          temp = dst_reg(this, glsl_type::bool_type);
 732          emit(OR(temp, op[0], op[1]));
 733          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 734          return;
 735
 736       case ir_binop_logic_and:
 737          temp = dst_reg(this, glsl_type::bool_type);
 738          emit(AND(temp, op[0], op[1]));
 739          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 740          return;
 741
 742       case ir_unop_f2b:
 743          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 744          return;
 745
 746       case ir_unop_i2b:
 747          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 748          return;
 749
 750       case ir_binop_greater:
 751       case ir_binop_gequal:
 752       case ir_binop_less:
 753       case ir_binop_lequal:
 754       case ir_binop_equal:
 755       case ir_binop_nequal:
 756          emit(IF(op[0], op[1],
 757                  brw_conditional_for_comparison(expr->operation)));
 758          return;
 759
 760       case ir_binop_all_equal:
 761          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 762          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 763          return;
 764
 765       case ir_binop_any_nequal:
 766          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 767          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 768          return;
 769
 770       case ir_unop_any:
 771          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 772          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 773          return;
 774
 775       default:
 776          assert(!"not reached");
 777          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 778          return;
 779       }
 780       return;
 781    }
 782
 783    ir->condition->accept(this);
 784
 785    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 786 }
 787
 788 void
 789 vec4_visitor::visit(ir_variable *ir)
 790 {
 791    dst_reg *reg = NULL;
 792
 793    if (variable_storage(ir))
 794       return;
 795
 796    switch (ir->mode) {
 797    case ir_var_in:
 798       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 799
 800       /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 801        * come in as floating point conversions of the integer values.
 802        */
 803       for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
 804          if (!c->key.gl_fixed_input_size[i])
 805             continue;
 806
 807          dst_reg dst = *reg;
 808          dst.type = brw_type_for_base_type(ir->type);
 809          dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
 810          emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 811       }
 812       break;
 813
 814    case ir_var_out:
 815       reg = new(mem_ctx) dst_reg(this, ir->type);
 816
 817       for (int i = 0; i < type_size(ir->type); i++) {
 818          output_reg[ir->location + i] = *reg;
 819          output_reg[ir->location + i].reg_offset = i;
 820          output_reg[ir->location + i].type =
 821             brw_type_for_base_type(ir->type->get_scalar_type());
 822          output_reg_annotation[ir->location + i] = ir->name;
 823       }
 824       break;
 825
 826    case ir_var_auto:
 827    case ir_var_temporary:
 828       reg = new(mem_ctx) dst_reg(this, ir->type);
 829       break;
 830
 831    case ir_var_uniform:
 832       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 833
 834       /* Thanks to the lower_ubo_reference pass, we will see only
 835        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 836        * variables, so no need for them to be in variable_ht.
 837        */
 838       if (ir->uniform_block != -1)
 839          return;
 840
 841       /* Track how big the whole uniform variable is, in case we need to put a
 842        * copy of its data into pull constants for array access.
 843        */
 844       this->uniform_size[this->uniforms] = type_size(ir->type);
 845
 846       if (!strncmp(ir->name, "gl_", 3)) {
 847          setup_builtin_uniform_values(ir);
 848       } else {
 849          setup_uniform_values(ir->location, ir->type);
 850       }
 851       break;
 852
 853    case ir_var_system_value:
 854       /* VertexID is stored by the VF as the last vertex element, but
 855        * we don't represent it with a flag in inputs_read, so we call
 856        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 857        */
 858       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 859       prog_data->uses_vertexid = true;
 860
 861       switch (ir->location) {
 862       case SYSTEM_VALUE_VERTEX_ID:
 863          reg->writemask = WRITEMASK_X;
 864          break;
 865       case SYSTEM_VALUE_INSTANCE_ID:
 866          reg->writemask = WRITEMASK_Y;
 867          break;
 868       default:
 869          assert(!"not reached");
 870          break;
 871       }
 872       break;
 873
 874    default:
 875       assert(!"not reached");
 876    }
 877
 878    reg->type = brw_type_for_base_type(ir->type);
 879    hash_table_insert(this->variable_ht, reg, ir);
 880 }
 881
 882 void
 883 vec4_visitor::visit(ir_loop *ir)
 884 {
 885    dst_reg counter;
 886
 887    /* We don't want debugging output to print the whole body of the
 888     * loop as the annotation.
 889     */
 890    this->base_ir = NULL;
 891
 892    if (ir->counter != NULL) {
 893       this->base_ir = ir->counter;
 894       ir->counter->accept(this);
 895       counter = *(variable_storage(ir->counter));
 896
 897       if (ir->from != NULL) {
 898          this->base_ir = ir->from;
 899          ir->from->accept(this);
 900
 901          emit(MOV(counter, this->result));
 902       }
 903    }
 904
 905    emit(BRW_OPCODE_DO);
 906
 907    if (ir->to) {
 908       this->base_ir = ir->to;
 909       ir->to->accept(this);
 910
 911       emit(CMP(dst_null_d(), src_reg(counter), this->result,
 912                brw_conditional_for_comparison(ir->cmp)));
 913
 914       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
 915       inst->predicate = BRW_PREDICATE_NORMAL;
 916    }
 917
 918    visit_instructions(&ir->body_instructions);
 919
 920
 921    if (ir->increment) {
 922       this->base_ir = ir->increment;
 923       ir->increment->accept(this);
 924       emit(ADD(counter, src_reg(counter), this->result));
 925    }
 926
 927    emit(BRW_OPCODE_WHILE);
 928 }
 929
 930 void
 931 vec4_visitor::visit(ir_loop_jump *ir)
 932 {
 933    switch (ir->mode) {
 934    case ir_loop_jump::jump_break:
 935       emit(BRW_OPCODE_BREAK);
 936       break;
 937    case ir_loop_jump::jump_continue:
 938       emit(BRW_OPCODE_CONTINUE);
 939       break;
 940    }
 941 }
 942
 943
 944 void
 945 vec4_visitor::visit(ir_function_signature *ir)
 946 {
 947    assert(0);
 948    (void)ir;
 949 }
 950
 951 void
 952 vec4_visitor::visit(ir_function *ir)
 953 {
 954    /* Ignore function bodies other than main() -- we shouldn't see calls to
 955     * them since they should all be inlined.
 956     */
 957    if (strcmp(ir->name, "main") == 0) {
 958       const ir_function_signature *sig;
 959       exec_list empty;
 960
 961       sig = ir->matching_signature(&empty);
 962
 963       assert(sig);
 964
 965       visit_instructions(&sig->body);
 966    }
 967 }
 968
 969 bool
 970 vec4_visitor::try_emit_sat(ir_expression *ir)
 971 {
 972    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
 973    if (!sat_src)
 974       return false;
 975
 976    sat_src->accept(this);
 977    src_reg src = this->result;
 978
 979    this->result = src_reg(this, ir->type);
 980    vec4_instruction *inst;
 981    inst = emit(MOV(dst_reg(this->result), src));
 982    inst->saturate = true;
 983
 984    return true;
 985 }
 986
 987 void
 988 vec4_visitor::emit_bool_comparison(unsigned int op,
 989                                  dst_reg dst, src_reg src0, src_reg src1)
 990 {
 991    /* original gen4 does destination conversion before comparison. */
 992    if (intel->gen < 5)
 993       dst.type = src0.type;
 994
 995    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
 996
 997    dst.type = BRW_REGISTER_TYPE_D;
 998    emit(AND(dst, src_reg(dst), src_reg(0x1)));
 999 }
1000
1001 void
1002 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1003                           src_reg src0, src_reg src1)
1004 {
1005    vec4_instruction *inst;
1006
1007    if (intel->gen >= 6) {
1008       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1009       inst->conditional_mod = conditionalmod;
1010    } else {
1011       emit(CMP(dst, src0, src1, conditionalmod));
1012
1013       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1014       inst->predicate = BRW_PREDICATE_NORMAL;
1015    }
1016 }
1017
1018 void
1019 vec4_visitor::visit(ir_expression *ir)
1020 {
1021    unsigned int operand;
1022    src_reg op[Elements(ir->operands)];
1023    src_reg result_src;
1024    dst_reg result_dst;
1025    vec4_instruction *inst;
1026
1027    if (try_emit_sat(ir))
1028       return;
1029
1030    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1031       this->result.file = BAD_FILE;
1032       ir->operands[operand]->accept(this);
1033       if (this->result.file == BAD_FILE) {
1034          printf("Failed to get tree for expression operand:\n");
1035          ir->operands[operand]->print();
1036          exit(1);
1037       }
1038       op[operand] = this->result;
1039
1040       /* Matrix expression operands should have been broken down to vector
1041        * operations already.
1042        */
1043       assert(!ir->operands[operand]->type->is_matrix());
1044    }
1045
1046    int vector_elements = ir->operands[0]->type->vector_elements;
1047    if (ir->operands[1]) {
1048       vector_elements = MAX2(vector_elements,
1049                              ir->operands[1]->type->vector_elements);
1050    }
1051
1052    this->result.file = BAD_FILE;
1053
1054    /* Storage for our result.  Ideally for an assignment we'd be using
1055     * the actual storage for the result here, instead.
1056     */
1057    result_src = src_reg(this, ir->type);
1058    /* convenience for the emit functions below. */
1059    result_dst = dst_reg(result_src);
1060    /* If nothing special happens, this is the result. */
1061    this->result = result_src;
1062    /* Limit writes to the channels that will be used by result_src later.
1063     * This does limit this temp's use as a temporary for multi-instruction
1064     * sequences.
1065     */
1066    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1067
1068    switch (ir->operation) {
1069    case ir_unop_logic_not:
1070       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1071        * ones complement of the whole register, not just bit 0.
1072        */
1073       emit(XOR(result_dst, op[0], src_reg(1)));
1074       break;
1075    case ir_unop_neg:
1076       op[0].negate = !op[0].negate;
1077       this->result = op[0];
1078       break;
1079    case ir_unop_abs:
1080       op[0].abs = true;
1081       op[0].negate = false;
1082       this->result = op[0];
1083       break;
1084
1085    case ir_unop_sign:
1086       emit(MOV(result_dst, src_reg(0.0f)));
1087
1088       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1089       inst = emit(MOV(result_dst, src_reg(1.0f)));
1090       inst->predicate = BRW_PREDICATE_NORMAL;
1091
1092       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1093       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1094       inst->predicate = BRW_PREDICATE_NORMAL;
1095
1096       break;
1097
1098    case ir_unop_rcp:
1099       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1100       break;
1101
1102    case ir_unop_exp2:
1103       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1104       break;
1105    case ir_unop_log2:
1106       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1107       break;
1108    case ir_unop_exp:
1109    case ir_unop_log:
1110       assert(!"not reached: should be handled by ir_explog_to_explog2");
1111       break;
1112    case ir_unop_sin:
1113    case ir_unop_sin_reduced:
1114       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1115       break;
1116    case ir_unop_cos:
1117    case ir_unop_cos_reduced:
1118       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1119       break;
1120
1121    case ir_unop_dFdx:
1122    case ir_unop_dFdy:
1123       assert(!"derivatives not valid in vertex shader");
1124       break;
1125
1126    case ir_unop_noise:
1127       assert(!"not reached: should be handled by lower_noise");
1128       break;
1129
1130    case ir_binop_add:
1131       emit(ADD(result_dst, op[0], op[1]));
1132       break;
1133    case ir_binop_sub:
1134       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1135       break;
1136
1137    case ir_binop_mul:
1138       if (ir->type->is_integer()) {
1139          /* For integer multiplication, the MUL uses the low 16 bits
1140           * of one of the operands (src0 on gen6, src1 on gen7).  The
1141           * MACH accumulates in the contribution of the upper 16 bits
1142           * of that operand.
1143           *
1144           * FINISHME: Emit just the MUL if we know an operand is small
1145           * enough.
1146           */
1147          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1148
1149          emit(MUL(acc, op[0], op[1]));
1150          emit(MACH(dst_null_d(), op[0], op[1]));
1151          emit(MOV(result_dst, src_reg(acc)));
1152       } else {
1153          emit(MUL(result_dst, op[0], op[1]));
1154       }
1155       break;
1156    case ir_binop_div:
1157       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1158       assert(ir->type->is_integer());
1159       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1160       break;
1161    case ir_binop_mod:
1162       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1163       assert(ir->type->is_integer());
1164       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1165       break;
1166
1167    case ir_binop_less:
1168    case ir_binop_greater:
1169    case ir_binop_lequal:
1170    case ir_binop_gequal:
1171    case ir_binop_equal:
1172    case ir_binop_nequal: {
1173       emit(CMP(result_dst, op[0], op[1],
1174                brw_conditional_for_comparison(ir->operation)));
1175       emit(AND(result_dst, result_src, src_reg(0x1)));
1176       break;
1177    }
1178
1179    case ir_binop_all_equal:
1180       /* "==" operator producing a scalar boolean. */
1181       if (ir->operands[0]->type->is_vector() ||
1182           ir->operands[1]->type->is_vector()) {
1183          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1184          emit(MOV(result_dst, src_reg(0)));
1185          inst = emit(MOV(result_dst, src_reg(1)));
1186          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1187       } else {
1188          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1189          emit(AND(result_dst, result_src, src_reg(0x1)));
1190       }
1191       break;
1192    case ir_binop_any_nequal:
1193       /* "!=" operator producing a scalar boolean. */
1194       if (ir->operands[0]->type->is_vector() ||
1195           ir->operands[1]->type->is_vector()) {
1196          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1197
1198          emit(MOV(result_dst, src_reg(0)));
1199          inst = emit(MOV(result_dst, src_reg(1)));
1200          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1201       } else {
1202          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1203          emit(AND(result_dst, result_src, src_reg(0x1)));
1204       }
1205       break;
1206
1207    case ir_unop_any:
1208       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1209       emit(MOV(result_dst, src_reg(0)));
1210
1211       inst = emit(MOV(result_dst, src_reg(1)));
1212       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1213       break;
1214
1215    case ir_binop_logic_xor:
1216       emit(XOR(result_dst, op[0], op[1]));
1217       break;
1218
1219    case ir_binop_logic_or:
1220       emit(OR(result_dst, op[0], op[1]));
1221       break;
1222
1223    case ir_binop_logic_and:
1224       emit(AND(result_dst, op[0], op[1]));
1225       break;
1226
1227    case ir_binop_dot:
1228       assert(ir->operands[0]->type->is_vector());
1229       assert(ir->operands[0]->type == ir->operands[1]->type);
1230       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1231       break;
1232
1233    case ir_unop_sqrt:
1234       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1235       break;
1236    case ir_unop_rsq:
1237       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1238       break;
1239
1240    case ir_unop_bitcast_i2f:
1241    case ir_unop_bitcast_u2f:
1242       this->result = op[0];
1243       this->result.type = BRW_REGISTER_TYPE_F;
1244       break;
1245
1246    case ir_unop_bitcast_f2i:
1247       this->result = op[0];
1248       this->result.type = BRW_REGISTER_TYPE_D;
1249       break;
1250
1251    case ir_unop_bitcast_f2u:
1252       this->result = op[0];
1253       this->result.type = BRW_REGISTER_TYPE_UD;
1254       break;
1255
1256    case ir_unop_i2f:
1257    case ir_unop_i2u:
1258    case ir_unop_u2i:
1259    case ir_unop_u2f:
1260    case ir_unop_b2f:
1261    case ir_unop_b2i:
1262    case ir_unop_f2i:
1263    case ir_unop_f2u:
1264       emit(MOV(result_dst, op[0]));
1265       break;
1266    case ir_unop_f2b:
1267    case ir_unop_i2b: {
1268       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1269       emit(AND(result_dst, result_src, src_reg(1)));
1270       break;
1271    }
1272
1273    case ir_unop_trunc:
1274       emit(RNDZ(result_dst, op[0]));
1275       break;
1276    case ir_unop_ceil:
1277       op[0].negate = !op[0].negate;
1278       inst = emit(RNDD(result_dst, op[0]));
1279       this->result.negate = true;
1280       break;
1281    case ir_unop_floor:
1282       inst = emit(RNDD(result_dst, op[0]));
1283       break;
1284    case ir_unop_fract:
1285       inst = emit(FRC(result_dst, op[0]));
1286       break;
1287    case ir_unop_round_even:
1288       emit(RNDE(result_dst, op[0]));
1289       break;
1290
1291    case ir_binop_min:
1292       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1293       break;
1294    case ir_binop_max:
1295       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1296       break;
1297
1298    case ir_binop_pow:
1299       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1300       break;
1301
1302    case ir_unop_bit_not:
1303       inst = emit(NOT(result_dst, op[0]));
1304       break;
1305    case ir_binop_bit_and:
1306       inst = emit(AND(result_dst, op[0], op[1]));
1307       break;
1308    case ir_binop_bit_xor:
1309       inst = emit(XOR(result_dst, op[0], op[1]));
1310       break;
1311    case ir_binop_bit_or:
1312       inst = emit(OR(result_dst, op[0], op[1]));
1313       break;
1314
1315    case ir_binop_lshift:
1316       inst = emit(BRW_OPCODE_SHL, result_dst, op[0], op[1]);
1317       break;
1318
1319    case ir_binop_rshift:
1320       if (ir->type->base_type == GLSL_TYPE_INT)
1321          inst = emit(BRW_OPCODE_ASR, result_dst, op[0], op[1]);
1322       else
1323          inst = emit(BRW_OPCODE_SHR, result_dst, op[0], op[1]);
1324       break;
1325
1326    case ir_binop_ubo_load: {
1327       ir_constant *uniform_block = ir->operands[0]->as_constant();
1328       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1329       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1330       src_reg offset = op[1];
1331
1332       /* Now, load the vector from that offset. */
1333       assert(ir->type->is_vector() || ir->type->is_scalar());
1334
1335       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1336       packed_consts.type = result.type;
1337       src_reg surf_index =
1338          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1339       if (const_offset_ir) {
1340          offset = src_reg(const_offset / 16);
1341       } else {
1342          emit(BRW_OPCODE_SHR, dst_reg(offset), offset, src_reg(4));
1343       }
1344
1345       vec4_instruction *pull =
1346          emit(new(mem_ctx) vec4_instruction(this,
1347                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1348                                             dst_reg(packed_consts),
1349                                             surf_index,
1350                                             offset));
1351       pull->base_mrf = 14;
1352       pull->mlen = 1;
1353
1354       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1355       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1356                                             const_offset % 16 / 4,
1357                                             const_offset % 16 / 4,
1358                                             const_offset % 16 / 4);
1359
1360       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1361       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1362          emit(CMP(result_dst, packed_consts, src_reg(0u),
1363                   BRW_CONDITIONAL_NZ));
1364          emit(AND(result_dst, result, src_reg(0x1)));
1365       } else {
1366          emit(MOV(result_dst, packed_consts));
1367       }
1368       break;
1369    }
1370
1371    case ir_quadop_vector:
1372       assert(!"not reached: should be handled by lower_quadop_vector");
1373       break;
1374    }
1375 }
1376
1377
1378 void
1379 vec4_visitor::visit(ir_swizzle *ir)
1380 {
1381    src_reg src;
1382    int i = 0;
1383    int swizzle[4];
1384
1385    /* Note that this is only swizzles in expressions, not those on the left
1386     * hand side of an assignment, which do write masking.  See ir_assignment
1387     * for that.
1388     */
1389
1390    ir->val->accept(this);
1391    src = this->result;
1392    assert(src.file != BAD_FILE);
1393
1394    for (i = 0; i < ir->type->vector_elements; i++) {
1395       switch (i) {
1396       case 0:
1397          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1398          break;
1399       case 1:
1400          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1401          break;
1402       case 2:
1403          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1404          break;
1405       case 3:
1406          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1407             break;
1408       }
1409    }
1410    for (; i < 4; i++) {
1411       /* Replicate the last channel out. */
1412       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1413    }
1414
1415    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1416
1417    this->result = src;
1418 }
1419
1420 void
1421 vec4_visitor::visit(ir_dereference_variable *ir)
1422 {
1423    const struct glsl_type *type = ir->type;
1424    dst_reg *reg = variable_storage(ir->var);
1425
1426    if (!reg) {
1427       fail("Failed to find variable storage for %s\n", ir->var->name);
1428       this->result = src_reg(brw_null_reg());
1429       return;
1430    }
1431
1432    this->result = src_reg(*reg);
1433
1434    /* System values get their swizzle from the dst_reg writemask */
1435    if (ir->var->mode == ir_var_system_value)
1436       return;
1437
1438    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1439       this->result.swizzle = swizzle_for_size(type->vector_elements);
1440 }
1441
1442 void
1443 vec4_visitor::visit(ir_dereference_array *ir)
1444 {
1445    ir_constant *constant_index;
1446    src_reg src;
1447    int element_size = type_size(ir->type);
1448
1449    constant_index = ir->array_index->constant_expression_value();
1450
1451    ir->array->accept(this);
1452    src = this->result;
1453
1454    if (constant_index) {
1455       src.reg_offset += constant_index->value.i[0] * element_size;
1456    } else {
1457       /* Variable index array dereference.  It eats the "vec4" of the
1458        * base of the array and an index that offsets the Mesa register
1459        * index.
1460        */
1461       ir->array_index->accept(this);
1462
1463       src_reg index_reg;
1464
1465       if (element_size == 1) {
1466          index_reg = this->result;
1467       } else {
1468          index_reg = src_reg(this, glsl_type::int_type);
1469
1470          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1471       }
1472
1473       if (src.reladdr) {
1474          src_reg temp = src_reg(this, glsl_type::int_type);
1475
1476          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1477
1478          index_reg = temp;
1479       }
1480
1481       src.reladdr = ralloc(mem_ctx, src_reg);
1482       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1483    }
1484
1485    /* If the type is smaller than a vec4, replicate the last channel out. */
1486    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1487       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1488    else
1489       src.swizzle = BRW_SWIZZLE_NOOP;
1490    src.type = brw_type_for_base_type(ir->type);
1491
1492    this->result = src;
1493 }
1494
1495 void
1496 vec4_visitor::visit(ir_dereference_record *ir)
1497 {
1498    unsigned int i;
1499    const glsl_type *struct_type = ir->record->type;
1500    int offset = 0;
1501
1502    ir->record->accept(this);
1503
1504    for (i = 0; i < struct_type->length; i++) {
1505       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1506          break;
1507       offset += type_size(struct_type->fields.structure[i].type);
1508    }
1509
1510    /* If the type is smaller than a vec4, replicate the last channel out. */
1511    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1512       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1513    else
1514       this->result.swizzle = BRW_SWIZZLE_NOOP;
1515    this->result.type = brw_type_for_base_type(ir->type);
1516
1517    this->result.reg_offset += offset;
1518 }
1519
1520 /**
1521  * We want to be careful in assignment setup to hit the actual storage
1522  * instead of potentially using a temporary like we might with the
1523  * ir_dereference handler.
1524  */
1525 static dst_reg
1526 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1527 {
1528    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1529     * access of a vector, it must be separated into a series conditional moves
1530     * before reaching this point (see ir_vec_index_to_cond_assign).
1531     */
1532    assert(ir->as_dereference());
1533    ir_dereference_array *deref_array = ir->as_dereference_array();
1534    if (deref_array) {
1535       assert(!deref_array->array->type->is_vector());
1536    }
1537
1538    /* Use the rvalue deref handler for the most part.  We'll ignore
1539     * swizzles in it and write swizzles using writemask, though.
1540     */
1541    ir->accept(v);
1542    return dst_reg(v->result);
1543 }
1544
1545 void
1546 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1547                               const struct glsl_type *type, uint32_t predicate)
1548 {
1549    if (type->base_type == GLSL_TYPE_STRUCT) {
1550       for (unsigned int i = 0; i < type->length; i++) {
1551          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1552       }
1553       return;
1554    }
1555
1556    if (type->is_array()) {
1557       for (unsigned int i = 0; i < type->length; i++) {
1558          emit_block_move(dst, src, type->fields.array, predicate);
1559       }
1560       return;
1561    }
1562
1563    if (type->is_matrix()) {
1564       const struct glsl_type *vec_type;
1565
1566       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1567                                          type->vector_elements, 1);
1568
1569       for (int i = 0; i < type->matrix_columns; i++) {
1570          emit_block_move(dst, src, vec_type, predicate);
1571       }
1572       return;
1573    }
1574
1575    assert(type->is_scalar() || type->is_vector());
1576
1577    dst->type = brw_type_for_base_type(type);
1578    src->type = dst->type;
1579
1580    dst->writemask = (1 << type->vector_elements) - 1;
1581
1582    src->swizzle = swizzle_for_size(type->vector_elements);
1583
1584    vec4_instruction *inst = emit(MOV(*dst, *src));
1585    inst->predicate = predicate;
1586
1587    dst->reg_offset++;
1588    src->reg_offset++;
1589 }
1590
1591
1592 /* If the RHS processing resulted in an instruction generating a
1593  * temporary value, and it would be easy to rewrite the instruction to
1594  * generate its result right into the LHS instead, do so.  This ends
1595  * up reliably removing instructions where it can be tricky to do so
1596  * later without real UD chain information.
1597  */
1598 bool
1599 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1600                                      dst_reg dst,
1601                                      src_reg src,
1602                                      vec4_instruction *pre_rhs_inst,
1603                                      vec4_instruction *last_rhs_inst)
1604 {
1605    /* This could be supported, but it would take more smarts. */
1606    if (ir->condition)
1607       return false;
1608
1609    if (pre_rhs_inst == last_rhs_inst)
1610       return false; /* No instructions generated to work with. */
1611
1612    /* Make sure the last instruction generated our source reg. */
1613    if (src.file != GRF ||
1614        src.file != last_rhs_inst->dst.file ||
1615        src.reg != last_rhs_inst->dst.reg ||
1616        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1617        src.reladdr ||
1618        src.abs ||
1619        src.negate ||
1620        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1621       return false;
1622
1623    /* Check that that last instruction fully initialized the channels
1624     * we want to use, in the order we want to use them.  We could
1625     * potentially reswizzle the operands of many instructions so that
1626     * we could handle out of order channels, but don't yet.
1627     */
1628
1629    for (unsigned i = 0; i < 4; i++) {
1630       if (dst.writemask & (1 << i)) {
1631          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1632             return false;
1633
1634          if (BRW_GET_SWZ(src.swizzle, i) != i)
1635             return false;
1636       }
1637    }
1638
1639    /* Success!  Rewrite the instruction. */
1640    last_rhs_inst->dst.file = dst.file;
1641    last_rhs_inst->dst.reg = dst.reg;
1642    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1643    last_rhs_inst->dst.reladdr = dst.reladdr;
1644    last_rhs_inst->dst.writemask &= dst.writemask;
1645
1646    return true;
1647 }
1648
1649 void
1650 vec4_visitor::visit(ir_assignment *ir)
1651 {
1652    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1653    uint32_t predicate = BRW_PREDICATE_NONE;
1654
1655    if (!ir->lhs->type->is_scalar() &&
1656        !ir->lhs->type->is_vector()) {
1657       ir->rhs->accept(this);
1658       src_reg src = this->result;
1659
1660       if (ir->condition) {
1661          emit_bool_to_cond_code(ir->condition, &predicate);
1662       }
1663
1664       /* emit_block_move doesn't account for swizzles in the source register.
1665        * This should be ok, since the source register is a structure or an
1666        * array, and those can't be swizzled.  But double-check to be sure.
1667        */
1668       assert(src.swizzle ==
1669              (ir->rhs->type->is_matrix()
1670               ? swizzle_for_size(ir->rhs->type->vector_elements)
1671               : BRW_SWIZZLE_NOOP));
1672
1673       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1674       return;
1675    }
1676
1677    /* Now we're down to just a scalar/vector with writemasks. */
1678    int i;
1679
1680    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1681    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1682
1683    ir->rhs->accept(this);
1684
1685    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1686
1687    src_reg src = this->result;
1688
1689    int swizzles[4];
1690    int first_enabled_chan = 0;
1691    int src_chan = 0;
1692
1693    assert(ir->lhs->type->is_vector() ||
1694           ir->lhs->type->is_scalar());
1695    dst.writemask = ir->write_mask;
1696
1697    for (int i = 0; i < 4; i++) {
1698       if (dst.writemask & (1 << i)) {
1699          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1700          break;
1701       }
1702    }
1703
1704    /* Swizzle a small RHS vector into the channels being written.
1705     *
1706     * glsl ir treats write_mask as dictating how many channels are
1707     * present on the RHS while in our instructions we need to make
1708     * those channels appear in the slots of the vec4 they're written to.
1709     */
1710    for (int i = 0; i < 4; i++) {
1711       if (dst.writemask & (1 << i))
1712          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1713       else
1714          swizzles[i] = first_enabled_chan;
1715    }
1716    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1717                               swizzles[2], swizzles[3]);
1718
1719    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1720       return;
1721    }
1722
1723    if (ir->condition) {
1724       emit_bool_to_cond_code(ir->condition, &predicate);
1725    }
1726
1727    for (i = 0; i < type_size(ir->lhs->type); i++) {
1728       vec4_instruction *inst = emit(MOV(dst, src));
1729       inst->predicate = predicate;
1730
1731       dst.reg_offset++;
1732       src.reg_offset++;
1733    }
1734 }
1735
1736 void
1737 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1738 {
1739    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1740       foreach_list(node, &ir->components) {
1741          ir_constant *field_value = (ir_constant *)node;
1742
1743          emit_constant_values(dst, field_value);
1744       }
1745       return;
1746    }
1747
1748    if (ir->type->is_array()) {
1749       for (unsigned int i = 0; i < ir->type->length; i++) {
1750          emit_constant_values(dst, ir->array_elements[i]);
1751       }
1752       return;
1753    }
1754
1755    if (ir->type->is_matrix()) {
1756       for (int i = 0; i < ir->type->matrix_columns; i++) {
1757          float *vec = &ir->value.f[i * ir->type->vector_elements];
1758
1759          for (int j = 0; j < ir->type->vector_elements; j++) {
1760             dst->writemask = 1 << j;
1761             dst->type = BRW_REGISTER_TYPE_F;
1762
1763             emit(MOV(*dst, src_reg(vec[j])));
1764          }
1765          dst->reg_offset++;
1766       }
1767       return;
1768    }
1769
1770    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1771
1772    for (int i = 0; i < ir->type->vector_elements; i++) {
1773       if (!(remaining_writemask & (1 << i)))
1774          continue;
1775
1776       dst->writemask = 1 << i;
1777       dst->type = brw_type_for_base_type(ir->type);
1778
1779       /* Find other components that match the one we're about to
1780        * write.  Emits fewer instructions for things like vec4(0.5,
1781        * 1.5, 1.5, 1.5).
1782        */
1783       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1784          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1785             if (ir->value.b[i] == ir->value.b[j])
1786                dst->writemask |= (1 << j);
1787          } else {
1788             /* u, i, and f storage all line up, so no need for a
1789              * switch case for comparing each type.
1790              */
1791             if (ir->value.u[i] == ir->value.u[j])
1792                dst->writemask |= (1 << j);
1793          }
1794       }
1795
1796       switch (ir->type->base_type) {
1797       case GLSL_TYPE_FLOAT:
1798          emit(MOV(*dst, src_reg(ir->value.f[i])));
1799          break;
1800       case GLSL_TYPE_INT:
1801          emit(MOV(*dst, src_reg(ir->value.i[i])));
1802          break;
1803       case GLSL_TYPE_UINT:
1804          emit(MOV(*dst, src_reg(ir->value.u[i])));
1805          break;
1806       case GLSL_TYPE_BOOL:
1807          emit(MOV(*dst, src_reg(ir->value.b[i])));
1808          break;
1809       default:
1810          assert(!"Non-float/uint/int/bool constant");
1811          break;
1812       }
1813
1814       remaining_writemask &= ~dst->writemask;
1815    }
1816    dst->reg_offset++;
1817 }
1818
1819 void
1820 vec4_visitor::visit(ir_constant *ir)
1821 {
1822    dst_reg dst = dst_reg(this, ir->type);
1823    this->result = src_reg(dst);
1824
1825    emit_constant_values(&dst, ir);
1826 }
1827
1828 void
1829 vec4_visitor::visit(ir_call *ir)
1830 {
1831    assert(!"not reached");
1832 }
1833
1834 void
1835 vec4_visitor::visit(ir_texture *ir)
1836 {
1837    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1838
1839    /* Should be lowered by do_lower_texture_projection */
1840    assert(!ir->projector);
1841
1842    /* Generate code to compute all the subexpression trees.  This has to be
1843     * done before loading any values into MRFs for the sampler message since
1844     * generating these values may involve SEND messages that need the MRFs.
1845     */
1846    src_reg coordinate;
1847    if (ir->coordinate) {
1848       ir->coordinate->accept(this);
1849       coordinate = this->result;
1850    }
1851
1852    src_reg shadow_comparitor;
1853    if (ir->shadow_comparitor) {
1854       ir->shadow_comparitor->accept(this);
1855       shadow_comparitor = this->result;
1856    }
1857
1858    src_reg lod, dPdx, dPdy;
1859    switch (ir->op) {
1860    case ir_txf:
1861    case ir_txl:
1862    case ir_txs:
1863       ir->lod_info.lod->accept(this);
1864       lod = this->result;
1865       break;
1866    case ir_txd:
1867       ir->lod_info.grad.dPdx->accept(this);
1868       dPdx = this->result;
1869
1870       ir->lod_info.grad.dPdy->accept(this);
1871       dPdy = this->result;
1872       break;
1873    case ir_tex:
1874    case ir_txb:
1875       break;
1876    }
1877
1878    vec4_instruction *inst = NULL;
1879    switch (ir->op) {
1880    case ir_tex:
1881    case ir_txl:
1882       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1883       break;
1884    case ir_txd:
1885       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1886       break;
1887    case ir_txf:
1888       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1889       break;
1890    case ir_txs:
1891       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1892       break;
1893    case ir_txb:
1894       assert(!"TXB is not valid for vertex shaders.");
1895    }
1896
1897    /* Texel offsets go in the message header; Gen4 also requires headers. */
1898    inst->header_present = ir->offset || intel->gen < 5;
1899    inst->base_mrf = 2;
1900    inst->mlen = inst->header_present + 1; /* always at least one */
1901    inst->sampler = sampler;
1902    inst->dst = dst_reg(this, ir->type);
1903    inst->shadow_compare = ir->shadow_comparitor != NULL;
1904
1905    if (ir->offset != NULL && ir->op != ir_txf)
1906       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1907
1908    /* MRF for the first parameter */
1909    int param_base = inst->base_mrf + inst->header_present;
1910
1911    if (ir->op == ir_txs) {
1912       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
1913       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
1914            lod));
1915    } else {
1916       int i, coord_mask = 0, zero_mask = 0;
1917       /* Load the coordinate */
1918       /* FINISHME: gl_clamp_mask and saturate */
1919       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
1920          coord_mask |= (1 << i);
1921       for (; i < 4; i++)
1922          zero_mask |= (1 << i);
1923
1924       if (ir->offset && ir->op == ir_txf) {
1925          /* It appears that the ld instruction used for txf does its
1926           * address bounds check before adding in the offset.  To work
1927           * around this, just add the integer offset to the integer
1928           * texel coordinate, and don't put the offset in the header.
1929           */
1930          ir_constant *offset = ir->offset->as_constant();
1931          assert(offset);
1932
1933          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
1934             src_reg src = coordinate;
1935             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
1936                                        BRW_GET_SWZ(src.swizzle, j),
1937                                        BRW_GET_SWZ(src.swizzle, j),
1938                                        BRW_GET_SWZ(src.swizzle, j));
1939             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
1940                      src, offset->value.i[j]));
1941          }
1942       } else {
1943          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
1944                   coordinate));
1945       }
1946       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
1947                src_reg(0)));
1948       /* Load the shadow comparitor */
1949       if (ir->shadow_comparitor) {
1950          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
1951                           WRITEMASK_X),
1952                   shadow_comparitor));
1953          inst->mlen++;
1954       }
1955
1956       /* Load the LOD info */
1957       if (ir->op == ir_txl) {
1958          int mrf, writemask;
1959          if (intel->gen >= 5) {
1960             mrf = param_base + 1;
1961             if (ir->shadow_comparitor) {
1962                writemask = WRITEMASK_Y;
1963                /* mlen already incremented */
1964             } else {
1965                writemask = WRITEMASK_X;
1966                inst->mlen++;
1967             }
1968          } else /* intel->gen == 4 */ {
1969             mrf = param_base;
1970             writemask = WRITEMASK_Z;
1971          }
1972          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
1973       } else if (ir->op == ir_txf) {
1974          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
1975                   lod));
1976       } else if (ir->op == ir_txd) {
1977          const glsl_type *type = ir->lod_info.grad.dPdx->type;
1978
1979          if (intel->gen >= 5) {
1980             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1981             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1982             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
1983             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
1984             inst->mlen++;
1985
1986             if (ir->type->vector_elements == 3) {
1987                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
1988                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
1989                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
1990                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
1991                inst->mlen++;
1992             }
1993          } else /* intel->gen == 4 */ {
1994             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
1995             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
1996             inst->mlen += 2;
1997          }
1998       }
1999    }
2000
2001    emit(inst);
2002
2003    swizzle_result(ir, src_reg(inst->dst), sampler);
2004 }
2005
2006 void
2007 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2008 {
2009    this->result = orig_val;
2010
2011    int s = c->key.tex.swizzles[sampler];
2012
2013    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2014                         || s == SWIZZLE_NOOP)
2015       return;
2016
2017    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2018    int swizzle[4];
2019
2020    for (int i = 0; i < 4; i++) {
2021       switch (GET_SWZ(s, i)) {
2022       case SWIZZLE_ZERO:
2023          zero_mask |= (1 << i);
2024          break;
2025       case SWIZZLE_ONE:
2026          one_mask |= (1 << i);
2027          break;
2028       default:
2029          copy_mask |= (1 << i);
2030          swizzle[i] = GET_SWZ(s, i);
2031          break;
2032       }
2033    }
2034
2035    this->result = src_reg(this, ir->type);
2036    dst_reg swizzled_result(this->result);
2037
2038    if (copy_mask) {
2039       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2040       swizzled_result.writemask = copy_mask;
2041       emit(MOV(swizzled_result, orig_val));
2042    }
2043
2044    if (zero_mask) {
2045       swizzled_result.writemask = zero_mask;
2046       emit(MOV(swizzled_result, src_reg(0.0f)));
2047    }
2048
2049    if (one_mask) {
2050       swizzled_result.writemask = one_mask;
2051       emit(MOV(swizzled_result, src_reg(1.0f)));
2052    }
2053 }
2054
2055 void
2056 vec4_visitor::visit(ir_return *ir)
2057 {
2058    assert(!"not reached");
2059 }
2060
2061 void
2062 vec4_visitor::visit(ir_discard *ir)
2063 {
2064    assert(!"not reached");
2065 }
2066
2067 void
2068 vec4_visitor::visit(ir_if *ir)
2069 {
2070    /* Don't point the annotation at the if statement, because then it plus
2071     * the then and else blocks get printed.
2072     */
2073    this->base_ir = ir->condition;
2074
2075    if (intel->gen == 6) {
2076       emit_if_gen6(ir);
2077    } else {
2078       uint32_t predicate;
2079       emit_bool_to_cond_code(ir->condition, &predicate);
2080       emit(IF(predicate));
2081    }
2082
2083    visit_instructions(&ir->then_instructions);
2084
2085    if (!ir->else_instructions.is_empty()) {
2086       this->base_ir = ir->condition;
2087       emit(BRW_OPCODE_ELSE);
2088
2089       visit_instructions(&ir->else_instructions);
2090    }
2091
2092    this->base_ir = ir->condition;
2093    emit(BRW_OPCODE_ENDIF);
2094 }
2095
2096 void
2097 vec4_visitor::emit_ndc_computation()
2098 {
2099    /* Get the position */
2100    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2101
2102    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2103    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2104    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2105
2106    current_annotation = "NDC";
2107    dst_reg ndc_w = ndc;
2108    ndc_w.writemask = WRITEMASK_W;
2109    src_reg pos_w = pos;
2110    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2111    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2112
2113    dst_reg ndc_xyz = ndc;
2114    ndc_xyz.writemask = WRITEMASK_XYZ;
2115
2116    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2117 }
2118
2119 void
2120 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2121 {
2122    if (intel->gen < 6 &&
2123        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2124         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2125       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2126       dst_reg header1_w = header1;
2127       header1_w.writemask = WRITEMASK_W;
2128       GLuint i;
2129
2130       emit(MOV(header1, 0u));
2131
2132       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2133          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2134
2135          current_annotation = "Point size";
2136          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2137          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2138       }
2139
2140       current_annotation = "Clipping flags";
2141       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2142          vec4_instruction *inst;
2143
2144          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2145                          src_reg(this->userplane[i])));
2146          inst->conditional_mod = BRW_CONDITIONAL_L;
2147
2148          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2149          inst->predicate = BRW_PREDICATE_NORMAL;
2150       }
2151
2152       /* i965 clipping workaround:
2153        * 1) Test for -ve rhw
2154        * 2) If set,
2155        *      set ndc = (0,0,0,0)
2156        *      set ucp[6] = 1
2157        *
2158        * Later, clipping will detect ucp[6] and ensure the primitive is
2159        * clipped against all fixed planes.
2160        */
2161       if (brw->has_negative_rhw_bug) {
2162 #if 0
2163          /* FINISHME */
2164          brw_CMP(p,
2165                  vec8(brw_null_reg()),
2166                  BRW_CONDITIONAL_L,
2167                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2168                  brw_imm_f(0));
2169
2170          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2171          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2172          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2173 #endif
2174       }
2175
2176       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2177    } else if (intel->gen < 6) {
2178       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2179    } else {
2180       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2181       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2182          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2183                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2184       }
2185    }
2186 }
2187
2188 void
2189 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2190 {
2191    if (intel->gen < 6) {
2192       /* Clip distance slots are set aside in gen5, but they are not used.  It
2193        * is not clear whether we actually need to set aside space for them,
2194        * but the performance cost is negligible.
2195        */
2196       return;
2197    }
2198
2199    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2200     *
2201     *     "If a linked set of shaders forming the vertex stage contains no
2202     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2203     *     application has requested clipping against user clip planes through
2204     *     the API, then the coordinate written to gl_Position is used for
2205     *     comparison against the user clip planes."
2206     *
2207     * This function is only called if the shader didn't write to
2208     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2209     * if the user wrote to it; otherwise we use gl_Position.
2210     */
2211    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2212    if (!(c->prog_data.outputs_written
2213          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2214       clip_vertex = VERT_RESULT_HPOS;
2215    }
2216
2217    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2218         ++i) {
2219       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2220                src_reg(output_reg[clip_vertex]),
2221                src_reg(this->userplane[i + offset])));
2222    }
2223 }
2224
2225 void
2226 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2227 {
2228    assert (vert_result < VERT_RESULT_MAX);
2229    reg.type = output_reg[vert_result].type;
2230    current_annotation = output_reg_annotation[vert_result];
2231    /* Copy the register, saturating if necessary */
2232    vec4_instruction *inst = emit(MOV(reg,
2233                                      src_reg(output_reg[vert_result])));
2234    if ((vert_result == VERT_RESULT_COL0 ||
2235         vert_result == VERT_RESULT_COL1 ||
2236         vert_result == VERT_RESULT_BFC0 ||
2237         vert_result == VERT_RESULT_BFC1) &&
2238        c->key.clamp_vertex_color) {
2239       inst->saturate = true;
2240    }
2241 }
2242
2243 void
2244 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2245 {
2246    struct brw_reg hw_reg = brw_message_reg(mrf);
2247    dst_reg reg = dst_reg(MRF, mrf);
2248    reg.type = BRW_REGISTER_TYPE_F;
2249
2250    switch (vert_result) {
2251    case VERT_RESULT_PSIZ:
2252       /* PSIZ is always in slot 0, and is coupled with other flags. */
2253       current_annotation = "indices, point width, clip flags";
2254       emit_psiz_and_flags(hw_reg);
2255       break;
2256    case BRW_VERT_RESULT_NDC:
2257       current_annotation = "NDC";
2258       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2259       break;
2260    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2261    case VERT_RESULT_HPOS:
2262       current_annotation = "gl_Position";
2263       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2264       break;
2265    case VERT_RESULT_CLIP_DIST0:
2266    case VERT_RESULT_CLIP_DIST1:
2267       if (this->c->key.uses_clip_distance) {
2268          emit_generic_urb_slot(reg, vert_result);
2269       } else {
2270          current_annotation = "user clip distances";
2271          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2272       }
2273       break;
2274    case VERT_RESULT_EDGE:
2275       /* This is present when doing unfilled polygons.  We're supposed to copy
2276        * the edge flag from the user-provided vertex array
2277        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2278        * of that attribute (starts as 1.0f).  This is then used in clipping to
2279        * determine which edges should be drawn as wireframe.
2280        */
2281       current_annotation = "edge flag";
2282       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2283                                     glsl_type::float_type, WRITEMASK_XYZW))));
2284       break;
2285    case BRW_VERT_RESULT_PAD:
2286       /* No need to write to this slot */
2287       break;
2288    default:
2289       emit_generic_urb_slot(reg, vert_result);
2290       break;
2291    }
2292 }
2293
2294 static int
2295 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2296 {
2297    struct intel_context *intel = &brw->intel;
2298
2299    if (intel->gen >= 6) {
2300       /* URB data written (does not include the message header reg) must
2301        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2302        * section 5.4.3.2.2: URB_INTERLEAVED.
2303        *
2304        * URB entries are allocated on a multiple of 1024 bits, so an
2305        * extra 128 bits written here to make the end align to 256 is
2306        * no problem.
2307        */
2308       if ((mlen % 2) != 1)
2309          mlen++;
2310    }
2311
2312    return mlen;
2313 }
2314
2315 /**
2316  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2317  * complete the VS thread.
2318  *
2319  * The VUE layout is documented in Volume 2a.
2320  */
2321 void
2322 vec4_visitor::emit_urb_writes()
2323 {
2324    /* MRF 0 is reserved for the debugger, so start with message header
2325     * in MRF 1.
2326     */
2327    int base_mrf = 1;
2328    int mrf = base_mrf;
2329    /* In the process of generating our URB write message contents, we
2330     * may need to unspill a register or load from an array.  Those
2331     * reads would use MRFs 14-15.
2332     */
2333    int max_usable_mrf = 13;
2334
2335    /* The following assertion verifies that max_usable_mrf causes an
2336     * even-numbered amount of URB write data, which will meet gen6's
2337     * requirements for length alignment.
2338     */
2339    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2340
2341    /* First mrf is the g0-based message header containing URB handles and such,
2342     * which is implied in VS_OPCODE_URB_WRITE.
2343     */
2344    mrf++;
2345
2346    if (intel->gen < 6) {
2347       emit_ndc_computation();
2348    }
2349
2350    /* Set up the VUE data for the first URB write */
2351    int slot;
2352    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2353       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2354
2355       /* If this was max_usable_mrf, we can't fit anything more into this URB
2356        * WRITE.
2357        */
2358       if (mrf > max_usable_mrf) {
2359          slot++;
2360          break;
2361       }
2362    }
2363
2364    current_annotation = "URB write";
2365    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2366    inst->base_mrf = base_mrf;
2367    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2368    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2369
2370    /* Optional second URB write */
2371    if (!inst->eot) {
2372       mrf = base_mrf + 1;
2373
2374       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2375          assert(mrf < max_usable_mrf);
2376
2377          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2378       }
2379
2380       current_annotation = "URB write";
2381       inst = emit(VS_OPCODE_URB_WRITE);
2382       inst->base_mrf = base_mrf;
2383       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2384       inst->eot = true;
2385       /* URB destination offset.  In the previous write, we got MRFs
2386        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2387        * URB row increments, and each of our MRFs is half of one of
2388        * those, since we're doing interleaved writes.
2389        */
2390       inst->offset = (max_usable_mrf - base_mrf) / 2;
2391    }
2392 }
2393
2394 src_reg
2395 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2396                                  src_reg *reladdr, int reg_offset)
2397 {
2398    /* Because we store the values to scratch interleaved like our
2399     * vertex data, we need to scale the vec4 index by 2.
2400     */
2401    int message_header_scale = 2;
2402
2403    /* Pre-gen6, the message header uses byte offsets instead of vec4
2404     * (16-byte) offset units.
2405     */
2406    if (intel->gen < 6)
2407       message_header_scale *= 16;
2408
2409    if (reladdr) {
2410       src_reg index = src_reg(this, glsl_type::int_type);
2411
2412       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2413       emit_before(inst, MUL(dst_reg(index),
2414                             index, src_reg(message_header_scale)));
2415
2416       return index;
2417    } else {
2418       return src_reg(reg_offset * message_header_scale);
2419    }
2420 }
2421
2422 src_reg
2423 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2424                                        src_reg *reladdr, int reg_offset)
2425 {
2426    if (reladdr) {
2427       src_reg index = src_reg(this, glsl_type::int_type);
2428
2429       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2430
2431       /* Pre-gen6, the message header uses byte offsets instead of vec4
2432        * (16-byte) offset units.
2433        */
2434       if (intel->gen < 6) {
2435          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2436       }
2437
2438       return index;
2439    } else {
2440       int message_header_scale = intel->gen < 6 ? 16 : 1;
2441       return src_reg(reg_offset * message_header_scale);
2442    }
2443 }
2444
2445 /**
2446  * Emits an instruction before @inst to load the value named by @orig_src
2447  * from scratch space at @base_offset to @temp.
2448  *
2449  * @base_offset is measured in 32-byte units (the size of a register).
2450  */
2451 void
2452 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2453                                 dst_reg temp, src_reg orig_src,
2454                                 int base_offset)
2455 {
2456    int reg_offset = base_offset + orig_src.reg_offset;
2457    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2458
2459    emit_before(inst, SCRATCH_READ(temp, index));
2460 }
2461
2462 /**
2463  * Emits an instruction after @inst to store the value to be written
2464  * to @orig_dst to scratch space at @base_offset, from @temp.
2465  *
2466  * @base_offset is measured in 32-byte units (the size of a register).
2467  */
2468 void
2469 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2470 {
2471    int reg_offset = base_offset + inst->dst.reg_offset;
2472    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2473
2474    /* Create a temporary register to store *inst's result in.
2475     *
2476     * We have to be careful in MOVing from our temporary result register in
2477     * the scratch write.  If we swizzle from channels of the temporary that
2478     * weren't initialized, it will confuse live interval analysis, which will
2479     * make spilling fail to make progress.
2480     */
2481    src_reg temp = src_reg(this, glsl_type::vec4_type);
2482    temp.type = inst->dst.type;
2483    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2484    int swizzles[4];
2485    for (int i = 0; i < 4; i++)
2486       if (inst->dst.writemask & (1 << i))
2487          swizzles[i] = i;
2488       else
2489          swizzles[i] = first_writemask_chan;
2490    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2491                                swizzles[2], swizzles[3]);
2492
2493    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2494                                        inst->dst.writemask));
2495    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2496    write->predicate = inst->predicate;
2497    write->ir = inst->ir;
2498    write->annotation = inst->annotation;
2499    inst->insert_after(write);
2500
2501    inst->dst.file = temp.file;
2502    inst->dst.reg = temp.reg;
2503    inst->dst.reg_offset = temp.reg_offset;
2504    inst->dst.reladdr = NULL;
2505 }
2506
2507 /**
2508  * We can't generally support array access in GRF space, because a
2509  * single instruction's destination can only span 2 contiguous
2510  * registers.  So, we send all GRF arrays that get variable index
2511  * access to scratch space.
2512  */
2513 void
2514 vec4_visitor::move_grf_array_access_to_scratch()
2515 {
2516    int scratch_loc[this->virtual_grf_count];
2517
2518    for (int i = 0; i < this->virtual_grf_count; i++) {
2519       scratch_loc[i] = -1;
2520    }
2521
2522    /* First, calculate the set of virtual GRFs that need to be punted
2523     * to scratch due to having any array access on them, and where in
2524     * scratch.
2525     */
2526    foreach_list(node, &this->instructions) {
2527       vec4_instruction *inst = (vec4_instruction *)node;
2528
2529       if (inst->dst.file == GRF && inst->dst.reladdr &&
2530           scratch_loc[inst->dst.reg] == -1) {
2531          scratch_loc[inst->dst.reg] = c->last_scratch;
2532          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2533       }
2534
2535       for (int i = 0 ; i < 3; i++) {
2536          src_reg *src = &inst->src[i];
2537
2538          if (src->file == GRF && src->reladdr &&
2539              scratch_loc[src->reg] == -1) {
2540             scratch_loc[src->reg] = c->last_scratch;
2541             c->last_scratch += this->virtual_grf_sizes[src->reg];
2542          }
2543       }
2544    }
2545
2546    /* Now, for anything that will be accessed through scratch, rewrite
2547     * it to load/store.  Note that this is a _safe list walk, because
2548     * we may generate a new scratch_write instruction after the one
2549     * we're processing.
2550     */
2551    foreach_list_safe(node, &this->instructions) {
2552       vec4_instruction *inst = (vec4_instruction *)node;
2553
2554       /* Set up the annotation tracking for new generated instructions. */
2555       base_ir = inst->ir;
2556       current_annotation = inst->annotation;
2557
2558       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2559          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2560       }
2561
2562       for (int i = 0 ; i < 3; i++) {
2563          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2564             continue;
2565
2566          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2567
2568          emit_scratch_read(inst, temp, inst->src[i],
2569                            scratch_loc[inst->src[i].reg]);
2570
2571          inst->src[i].file = temp.file;
2572          inst->src[i].reg = temp.reg;
2573          inst->src[i].reg_offset = temp.reg_offset;
2574          inst->src[i].reladdr = NULL;
2575       }
2576    }
2577 }
2578
2579 /**
2580  * Emits an instruction before @inst to load the value named by @orig_src
2581  * from the pull constant buffer (surface) at @base_offset to @temp.
2582  */
2583 void
2584 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2585                                       dst_reg temp, src_reg orig_src,
2586                                       int base_offset)
2587 {
2588    int reg_offset = base_offset + orig_src.reg_offset;
2589    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2590    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2591    vec4_instruction *load;
2592
2593    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2594                                         temp, index, offset);
2595    load->base_mrf = 14;
2596    load->mlen = 1;
2597    emit_before(inst, load);
2598 }
2599
2600 /**
2601  * Implements array access of uniforms by inserting a
2602  * PULL_CONSTANT_LOAD instruction.
2603  *
2604  * Unlike temporary GRF array access (where we don't support it due to
2605  * the difficulty of doing relative addressing on instruction
2606  * destinations), we could potentially do array access of uniforms
2607  * that were loaded in GRF space as push constants.  In real-world
2608  * usage we've seen, though, the arrays being used are always larger
2609  * than we could load as push constants, so just always move all
2610  * uniform array access out to a pull constant buffer.
2611  */
2612 void
2613 vec4_visitor::move_uniform_array_access_to_pull_constants()
2614 {
2615    int pull_constant_loc[this->uniforms];
2616
2617    for (int i = 0; i < this->uniforms; i++) {
2618       pull_constant_loc[i] = -1;
2619    }
2620
2621    /* Walk through and find array access of uniforms.  Put a copy of that
2622     * uniform in the pull constant buffer.
2623     *
2624     * Note that we don't move constant-indexed accesses to arrays.  No
2625     * testing has been done of the performance impact of this choice.
2626     */
2627    foreach_list_safe(node, &this->instructions) {
2628       vec4_instruction *inst = (vec4_instruction *)node;
2629
2630       for (int i = 0 ; i < 3; i++) {
2631          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2632             continue;
2633
2634          int uniform = inst->src[i].reg;
2635
2636          /* If this array isn't already present in the pull constant buffer,
2637           * add it.
2638           */
2639          if (pull_constant_loc[uniform] == -1) {
2640             const float **values = &prog_data->param[uniform * 4];
2641
2642             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2643
2644             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2645                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2646             }
2647          }
2648
2649          /* Set up the annotation tracking for new generated instructions. */
2650          base_ir = inst->ir;
2651          current_annotation = inst->annotation;
2652
2653          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2654
2655          emit_pull_constant_load(inst, temp, inst->src[i],
2656                                  pull_constant_loc[uniform]);
2657
2658          inst->src[i].file = temp.file;
2659          inst->src[i].reg = temp.reg;
2660          inst->src[i].reg_offset = temp.reg_offset;
2661          inst->src[i].reladdr = NULL;
2662       }
2663    }
2664
2665    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2666     * no need to track them as larger-than-vec4 objects.  This will be
2667     * relied on in cutting out unused uniform vectors from push
2668     * constants.
2669     */
2670    split_uniform_registers();
2671 }
2672
2673 void
2674 vec4_visitor::resolve_ud_negate(src_reg *reg)
2675 {
2676    if (reg->type != BRW_REGISTER_TYPE_UD ||
2677        !reg->negate)
2678       return;
2679
2680    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2681    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2682    *reg = temp;
2683 }
2684
2685 vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2686                            struct gl_shader_program *prog,
2687                            struct brw_shader *shader)
2688 {
2689    this->c = c;
2690    this->p = &c->func;
2691    this->brw = p->brw;
2692    this->intel = &brw->intel;
2693    this->ctx = &intel->ctx;
2694    this->prog = prog;
2695    this->shader = shader;
2696
2697    this->mem_ctx = ralloc_context(NULL);
2698    this->failed = false;
2699
2700    this->base_ir = NULL;
2701    this->current_annotation = NULL;
2702
2703    this->c = c;
2704    this->vp = &c->vp->program;
2705    this->prog_data = &c->prog_data;
2706
2707    this->variable_ht = hash_table_ctor(0,
2708                                        hash_table_pointer_hash,
2709                                        hash_table_pointer_compare);
2710
2711    this->virtual_grf_def = NULL;
2712    this->virtual_grf_use = NULL;
2713    this->virtual_grf_sizes = NULL;
2714    this->virtual_grf_count = 0;
2715    this->virtual_grf_reg_map = NULL;
2716    this->virtual_grf_reg_count = 0;
2717    this->virtual_grf_array_size = 0;
2718    this->live_intervals_valid = false;
2719
2720    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2721
2722    this->uniforms = 0;
2723 }
2724
2725 vec4_visitor::~vec4_visitor()
2726 {
2727    ralloc_free(this->mem_ctx);
2728    hash_table_dtor(this->variable_ht);
2729 }
2730
2731
2732 void
2733 vec4_visitor::fail(const char *format, ...)
2734 {
2735    va_list va;
2736    char *msg;
2737
2738    if (failed)
2739       return;
2740
2741    failed = true;
2742
2743    va_start(va, format);
2744    msg = ralloc_vasprintf(mem_ctx, format, va);
2745    va_end(va);
2746    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2747
2748    this->fail_msg = msg;
2749
2750    if (INTEL_DEBUG & DEBUG_VS) {
2751       fprintf(stderr, "%s",  msg);
2752    }
2753 }
2754
2755 } /* namespace brw */