src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 extern "C" {
  26 #include "main/context.h"
  27 #include "main/macros.h"
  28 #include "program/prog_parameter.h"
  29 #include "program/sampler.h"
  30 }
  31
  32 namespace brw {
  33
  34 vec4_instruction::vec4_instruction(vec4_visitor *v,
  35                                    enum opcode opcode, dst_reg dst,
  36                                    src_reg src0, src_reg src1, src_reg src2)
  37 {
  38    this->opcode = opcode;
  39    this->dst = dst;
  40    this->src[0] = src0;
  41    this->src[1] = src1;
  42    this->src[2] = src2;
  43    this->ir = v->base_ir;
  44    this->annotation = v->current_annotation;
  45 }
  46
  47 vec4_instruction *
  48 vec4_visitor::emit(vec4_instruction *inst)
  49 {
  50    this->instructions.push_tail(inst);
  51
  52    return inst;
  53 }
  54
  55 vec4_instruction *
  56 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  57 {
  58    new_inst->ir = inst->ir;
  59    new_inst->annotation = inst->annotation;
  60
  61    inst->insert_before(new_inst);
  62
  63    return inst;
  64 }
  65
  66 vec4_instruction *
  67 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  68                    src_reg src0, src_reg src1, src_reg src2)
  69 {
  70    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  71                                              src0, src1, src2));
  72 }
  73
  74
  75 vec4_instruction *
  76 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  77 {
  78    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  79 }
  80
  81 vec4_instruction *
  82 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  83 {
  84    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  85 }
  86
  87 vec4_instruction *
  88 vec4_visitor::emit(enum opcode opcode)
  89 {
  90    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  91 }
  92
  93 #define ALU1(op)                                                        \
  94    vec4_instruction *                                                   \
  95    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  96    {                                                                    \
  97       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  98                                            src0);                       \
  99    }
 100
 101 #define ALU2(op)                                                        \
 102    vec4_instruction *                                                   \
 103    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
 104    {                                                                    \
 105       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
 106                                            src0, src1);                 \
 107    }
 108
 109 ALU1(NOT)
 110 ALU1(MOV)
 111 ALU1(FRC)
 112 ALU1(RNDD)
 113 ALU1(RNDE)
 114 ALU1(RNDZ)
 115 ALU2(ADD)
 116 ALU2(MUL)
 117 ALU2(MACH)
 118 ALU2(AND)
 119 ALU2(OR)
 120 ALU2(XOR)
 121 ALU2(DP3)
 122 ALU2(DP4)
 123 ALU2(DPH)
 124 ALU2(SHL)
 125 ALU2(SHR)
 126 ALU2(ASR)
 127
 128 /** Gen4 predicated IF. */
 129 vec4_instruction *
 130 vec4_visitor::IF(uint32_t predicate)
 131 {
 132    vec4_instruction *inst;
 133
 134    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
 135    inst->predicate = predicate;
 136
 137    return inst;
 138 }
 139
 140 /** Gen6+ IF with embedded comparison. */
 141 vec4_instruction *
 142 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
 143 {
 144    assert(intel->gen >= 6);
 145
 146    vec4_instruction *inst;
 147
 148    resolve_ud_negate(&src0);
 149    resolve_ud_negate(&src1);
 150
 151    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
 152                                         src0, src1);
 153    inst->conditional_mod = condition;
 154
 155    return inst;
 156 }
 157
 158 /**
 159  * CMP: Sets the low bit of the destination channels with the result
 160  * of the comparison, while the upper bits are undefined, and updates
 161  * the flag register with the packed 16 bits of the result.
 162  */
 163 vec4_instruction *
 164 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
 165 {
 166    vec4_instruction *inst;
 167
 168    /* original gen4 does type conversion to the destination type
 169     * before before comparison, producing garbage results for floating
 170     * point comparisons.
 171     */
 172    if (intel->gen == 4) {
 173       dst.type = src0.type;
 174       if (dst.file == HW_REG)
 175          dst.fixed_hw_reg.type = dst.type;
 176    }
 177
 178    resolve_ud_negate(&src0);
 179    resolve_ud_negate(&src1);
 180
 181    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
 182    inst->conditional_mod = condition;
 183
 184    return inst;
 185 }
 186
 187 vec4_instruction *
 188 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
 189 {
 190    vec4_instruction *inst;
 191
 192    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
 193                                         dst, index);
 194    inst->base_mrf = 14;
 195    inst->mlen = 2;
 196
 197    return inst;
 198 }
 199
 200 vec4_instruction *
 201 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
 202 {
 203    vec4_instruction *inst;
 204
 205    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
 206                                         dst, src, index);
 207    inst->base_mrf = 13;
 208    inst->mlen = 3;
 209
 210    return inst;
 211 }
 212
 213 void
 214 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
 215 {
 216    static enum opcode dot_opcodes[] = {
 217       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
 218    };
 219
 220    emit(dot_opcodes[elements - 2], dst, src0, src1);
 221 }
 222
 223 src_reg
 224 vec4_visitor::fix_math_operand(src_reg src)
 225 {
 226    /* The gen6 math instruction ignores the source modifiers --
 227     * swizzle, abs, negate, and at least some parts of the register
 228     * region description.
 229     *
 230     * Rather than trying to enumerate all these cases, *always* expand the
 231     * operand to a temp GRF for gen6.
 232     *
 233     * For gen7, keep the operand as-is, except if immediate, which gen7 still
 234     * can't use.
 235     */
 236
 237    if (intel->gen == 7 && src.file != IMM)
 238       return src;
 239
 240    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
 241    expanded.type = src.type;
 242    emit(MOV(expanded, src));
 243    return src_reg(expanded);
 244 }
 245
 246 void
 247 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
 248 {
 249    src = fix_math_operand(src);
 250
 251    if (dst.writemask != WRITEMASK_XYZW) {
 252       /* The gen6 math instruction must be align1, so we can't do
 253        * writemasks.
 254        */
 255       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 256
 257       emit(opcode, temp_dst, src);
 258
 259       emit(MOV(dst, src_reg(temp_dst)));
 260    } else {
 261       emit(opcode, dst, src);
 262    }
 263 }
 264
 265 void
 266 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
 267 {
 268    vec4_instruction *inst = emit(opcode, dst, src);
 269    inst->base_mrf = 1;
 270    inst->mlen = 1;
 271 }
 272
 273 void
 274 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
 275 {
 276    switch (opcode) {
 277    case SHADER_OPCODE_RCP:
 278    case SHADER_OPCODE_RSQ:
 279    case SHADER_OPCODE_SQRT:
 280    case SHADER_OPCODE_EXP2:
 281    case SHADER_OPCODE_LOG2:
 282    case SHADER_OPCODE_SIN:
 283    case SHADER_OPCODE_COS:
 284       break;
 285    default:
 286       assert(!"not reached: bad math opcode");
 287       return;
 288    }
 289
 290    if (intel->gen >= 6) {
 291       return emit_math1_gen6(opcode, dst, src);
 292    } else {
 293       return emit_math1_gen4(opcode, dst, src);
 294    }
 295 }
 296
 297 void
 298 vec4_visitor::emit_math2_gen6(enum opcode opcode,
 299                               dst_reg dst, src_reg src0, src_reg src1)
 300 {
 301    src0 = fix_math_operand(src0);
 302    src1 = fix_math_operand(src1);
 303
 304    if (dst.writemask != WRITEMASK_XYZW) {
 305       /* The gen6 math instruction must be align1, so we can't do
 306        * writemasks.
 307        */
 308       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
 309       temp_dst.type = dst.type;
 310
 311       emit(opcode, temp_dst, src0, src1);
 312
 313       emit(MOV(dst, src_reg(temp_dst)));
 314    } else {
 315       emit(opcode, dst, src0, src1);
 316    }
 317 }
 318
 319 void
 320 vec4_visitor::emit_math2_gen4(enum opcode opcode,
 321                               dst_reg dst, src_reg src0, src_reg src1)
 322 {
 323    vec4_instruction *inst = emit(opcode, dst, src0, src1);
 324    inst->base_mrf = 1;
 325    inst->mlen = 2;
 326 }
 327
 328 void
 329 vec4_visitor::emit_math(enum opcode opcode,
 330                         dst_reg dst, src_reg src0, src_reg src1)
 331 {
 332    switch (opcode) {
 333    case SHADER_OPCODE_POW:
 334    case SHADER_OPCODE_INT_QUOTIENT:
 335    case SHADER_OPCODE_INT_REMAINDER:
 336       break;
 337    default:
 338       assert(!"not reached: unsupported binary math opcode");
 339       return;
 340    }
 341
 342    if (intel->gen >= 6) {
 343       return emit_math2_gen6(opcode, dst, src0, src1);
 344    } else {
 345       return emit_math2_gen4(opcode, dst, src0, src1);
 346    }
 347 }
 348
 349 void
 350 vec4_visitor::visit_instructions(const exec_list *list)
 351 {
 352    foreach_list(node, list) {
 353       ir_instruction *ir = (ir_instruction *)node;
 354
 355       base_ir = ir;
 356       ir->accept(this);
 357    }
 358 }
 359
 360
 361 static int
 362 type_size(const struct glsl_type *type)
 363 {
 364    unsigned int i;
 365    int size;
 366
 367    switch (type->base_type) {
 368    case GLSL_TYPE_UINT:
 369    case GLSL_TYPE_INT:
 370    case GLSL_TYPE_FLOAT:
 371    case GLSL_TYPE_BOOL:
 372       if (type->is_matrix()) {
 373          return type->matrix_columns;
 374       } else {
 375          /* Regardless of size of vector, it gets a vec4. This is bad
 376           * packing for things like floats, but otherwise arrays become a
 377           * mess.  Hopefully a later pass over the code can pack scalars
 378           * down if appropriate.
 379           */
 380          return 1;
 381       }
 382    case GLSL_TYPE_ARRAY:
 383       assert(type->length > 0);
 384       return type_size(type->fields.array) * type->length;
 385    case GLSL_TYPE_STRUCT:
 386       size = 0;
 387       for (i = 0; i < type->length; i++) {
 388          size += type_size(type->fields.structure[i].type);
 389       }
 390       return size;
 391    case GLSL_TYPE_SAMPLER:
 392       /* Samplers take up one slot in UNIFORMS[], but they're baked in
 393        * at link time.
 394        */
 395       return 1;
 396    default:
 397       assert(0);
 398       return 0;
 399    }
 400 }
 401
 402 int
 403 vec4_visitor::virtual_grf_alloc(int size)
 404 {
 405    if (virtual_grf_array_size <= virtual_grf_count) {
 406       if (virtual_grf_array_size == 0)
 407          virtual_grf_array_size = 16;
 408       else
 409          virtual_grf_array_size *= 2;
 410       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 411                                    virtual_grf_array_size);
 412       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
 413                                      virtual_grf_array_size);
 414    }
 415    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
 416    virtual_grf_reg_count += size;
 417    virtual_grf_sizes[virtual_grf_count] = size;
 418    return virtual_grf_count++;
 419 }
 420
 421 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 422 {
 423    init();
 424
 425    this->file = GRF;
 426    this->reg = v->virtual_grf_alloc(type_size(type));
 427
 428    if (type->is_array() || type->is_record()) {
 429       this->swizzle = BRW_SWIZZLE_NOOP;
 430    } else {
 431       this->swizzle = swizzle_for_size(type->vector_elements);
 432    }
 433
 434    this->type = brw_type_for_base_type(type);
 435 }
 436
 437 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 438 {
 439    init();
 440
 441    this->file = GRF;
 442    this->reg = v->virtual_grf_alloc(type_size(type));
 443
 444    if (type->is_array() || type->is_record()) {
 445       this->writemask = WRITEMASK_XYZW;
 446    } else {
 447       this->writemask = (1 << type->vector_elements) - 1;
 448    }
 449
 450    this->type = brw_type_for_base_type(type);
 451 }
 452
 453 /* Our support for uniforms is piggy-backed on the struct
 454  * gl_fragment_program, because that's where the values actually
 455  * get stored, rather than in some global gl_shader_program uniform
 456  * store.
 457  */
 458 int
 459 vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
 460 {
 461    unsigned int offset = 0;
 462    float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
 463
 464    if (type->is_matrix()) {
 465       const glsl_type *column = type->column_type();
 466
 467       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 468          offset += setup_uniform_values(loc + offset, column);
 469       }
 470
 471       return offset;
 472    }
 473
 474    switch (type->base_type) {
 475    case GLSL_TYPE_FLOAT:
 476    case GLSL_TYPE_UINT:
 477    case GLSL_TYPE_INT:
 478    case GLSL_TYPE_BOOL:
 479       for (unsigned int i = 0; i < type->vector_elements; i++) {
 480          c->prog_data.param[this->uniforms * 4 + i] = &values[i];
 481       }
 482
 483       /* Set up pad elements to get things aligned to a vec4 boundary. */
 484       for (unsigned int i = type->vector_elements; i < 4; i++) {
 485          static float zero = 0;
 486
 487          c->prog_data.param[this->uniforms * 4 + i] = &zero;
 488       }
 489
 490       /* Track the size of this uniform vector, for future packing of
 491        * uniforms.
 492        */
 493       this->uniform_vector_size[this->uniforms] = type->vector_elements;
 494       this->uniforms++;
 495
 496       return 1;
 497
 498    case GLSL_TYPE_STRUCT:
 499       for (unsigned int i = 0; i < type->length; i++) {
 500          offset += setup_uniform_values(loc + offset,
 501                                         type->fields.structure[i].type);
 502       }
 503       return offset;
 504
 505    case GLSL_TYPE_ARRAY:
 506       for (unsigned int i = 0; i < type->length; i++) {
 507          offset += setup_uniform_values(loc + offset, type->fields.array);
 508       }
 509       return offset;
 510
 511    case GLSL_TYPE_SAMPLER:
 512       /* The sampler takes up a slot, but we don't use any values from it. */
 513       return 1;
 514
 515    default:
 516       assert(!"not reached");
 517       return 0;
 518    }
 519 }
 520
 521 void
 522 vec4_visitor::setup_uniform_clipplane_values()
 523 {
 524    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
 525
 526    if (intel->gen < 6) {
 527       /* Pre-Gen6, we compact clip planes.  For example, if the user
 528        * enables just clip planes 0, 1, and 3, we will enable clip planes
 529        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
 530        * plane 2.  This simplifies the implementation of the Gen6 clip
 531        * thread.
 532        */
 533       int compacted_clipplane_index = 0;
 534       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
 535          if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
 536             continue;
 537
 538          this->uniform_vector_size[this->uniforms] = 4;
 539          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
 540          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
 541          for (int j = 0; j < 4; ++j) {
 542             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 543          }
 544          ++compacted_clipplane_index;
 545          ++this->uniforms;
 546       }
 547    } else {
 548       /* In Gen6 and later, we don't compact clip planes, because this
 549        * simplifies the implementation of gl_ClipDistance.
 550        */
 551       for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
 552          this->uniform_vector_size[this->uniforms] = 4;
 553          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
 554          this->userplane[i].type = BRW_REGISTER_TYPE_F;
 555          for (int j = 0; j < 4; ++j) {
 556             c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
 557          }
 558          ++this->uniforms;
 559       }
 560    }
 561 }
 562
 563 /* Our support for builtin uniforms is even scarier than non-builtin.
 564  * It sits on top of the PROG_STATE_VAR parameters that are
 565  * automatically updated from GL context state.
 566  */
 567 void
 568 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
 569 {
 570    const ir_state_slot *const slots = ir->state_slots;
 571    assert(ir->state_slots != NULL);
 572
 573    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 574       /* This state reference has already been setup by ir_to_mesa,
 575        * but we'll get the same index back here.  We can reference
 576        * ParameterValues directly, since unlike brw_fs.cpp, we never
 577        * add new state references during compile.
 578        */
 579       int index = _mesa_add_state_reference(this->vp->Base.Parameters,
 580                                             (gl_state_index *)slots[i].tokens);
 581       float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
 582
 583       this->uniform_vector_size[this->uniforms] = 0;
 584       /* Add each of the unique swizzled channels of the element.
 585        * This will end up matching the size of the glsl_type of this field.
 586        */
 587       int last_swiz = -1;
 588       for (unsigned int j = 0; j < 4; j++) {
 589          int swiz = GET_SWZ(slots[i].swizzle, j);
 590          last_swiz = swiz;
 591
 592          c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
 593          if (swiz <= last_swiz)
 594             this->uniform_vector_size[this->uniforms]++;
 595       }
 596       this->uniforms++;
 597    }
 598 }
 599
 600 dst_reg *
 601 vec4_visitor::variable_storage(ir_variable *var)
 602 {
 603    return (dst_reg *)hash_table_find(this->variable_ht, var);
 604 }
 605
 606 void
 607 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
 608 {
 609    ir_expression *expr = ir->as_expression();
 610
 611    *predicate = BRW_PREDICATE_NORMAL;
 612
 613    if (expr) {
 614       src_reg op[2];
 615       vec4_instruction *inst;
 616
 617       assert(expr->get_num_operands() <= 2);
 618       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 619          expr->operands[i]->accept(this);
 620          op[i] = this->result;
 621
 622          resolve_ud_negate(&op[i]);
 623       }
 624
 625       switch (expr->operation) {
 626       case ir_unop_logic_not:
 627          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
 628          inst->conditional_mod = BRW_CONDITIONAL_Z;
 629          break;
 630
 631       case ir_binop_logic_xor:
 632          inst = emit(XOR(dst_null_d(), op[0], op[1]));
 633          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 634          break;
 635
 636       case ir_binop_logic_or:
 637          inst = emit(OR(dst_null_d(), op[0], op[1]));
 638          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 639          break;
 640
 641       case ir_binop_logic_and:
 642          inst = emit(AND(dst_null_d(), op[0], op[1]));
 643          inst->conditional_mod = BRW_CONDITIONAL_NZ;
 644          break;
 645
 646       case ir_unop_f2b:
 647          if (intel->gen >= 6) {
 648             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
 649          } else {
 650             inst = emit(MOV(dst_null_f(), op[0]));
 651             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 652          }
 653          break;
 654
 655       case ir_unop_i2b:
 656          if (intel->gen >= 6) {
 657             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 658          } else {
 659             inst = emit(MOV(dst_null_d(), op[0]));
 660             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 661          }
 662          break;
 663
 664       case ir_binop_all_equal:
 665          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 666          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
 667          break;
 668
 669       case ir_binop_any_nequal:
 670          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 671          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 672          break;
 673
 674       case ir_unop_any:
 675          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 676          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
 677          break;
 678
 679       case ir_binop_greater:
 680       case ir_binop_gequal:
 681       case ir_binop_less:
 682       case ir_binop_lequal:
 683       case ir_binop_equal:
 684       case ir_binop_nequal:
 685          emit(CMP(dst_null_d(), op[0], op[1],
 686                   brw_conditional_for_comparison(expr->operation)));
 687          break;
 688
 689       default:
 690          assert(!"not reached");
 691          break;
 692       }
 693       return;
 694    }
 695
 696    ir->accept(this);
 697
 698    resolve_ud_negate(&this->result);
 699
 700    if (intel->gen >= 6) {
 701       vec4_instruction *inst = emit(AND(dst_null_d(),
 702                                         this->result, src_reg(1)));
 703       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 704    } else {
 705       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
 706       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 707    }
 708 }
 709
 710 /**
 711  * Emit a gen6 IF statement with the comparison folded into the IF
 712  * instruction.
 713  */
 714 void
 715 vec4_visitor::emit_if_gen6(ir_if *ir)
 716 {
 717    ir_expression *expr = ir->condition->as_expression();
 718
 719    if (expr) {
 720       src_reg op[2];
 721       dst_reg temp;
 722
 723       assert(expr->get_num_operands() <= 2);
 724       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
 725          expr->operands[i]->accept(this);
 726          op[i] = this->result;
 727       }
 728
 729       switch (expr->operation) {
 730       case ir_unop_logic_not:
 731          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
 732          return;
 733
 734       case ir_binop_logic_xor:
 735          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
 736          return;
 737
 738       case ir_binop_logic_or:
 739          temp = dst_reg(this, glsl_type::bool_type);
 740          emit(OR(temp, op[0], op[1]));
 741          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 742          return;
 743
 744       case ir_binop_logic_and:
 745          temp = dst_reg(this, glsl_type::bool_type);
 746          emit(AND(temp, op[0], op[1]));
 747          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
 748          return;
 749
 750       case ir_unop_f2b:
 751          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 752          return;
 753
 754       case ir_unop_i2b:
 755          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 756          return;
 757
 758       case ir_binop_greater:
 759       case ir_binop_gequal:
 760       case ir_binop_less:
 761       case ir_binop_lequal:
 762       case ir_binop_equal:
 763       case ir_binop_nequal:
 764          emit(IF(op[0], op[1],
 765                  brw_conditional_for_comparison(expr->operation)));
 766          return;
 767
 768       case ir_binop_all_equal:
 769          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
 770          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
 771          return;
 772
 773       case ir_binop_any_nequal:
 774          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
 775          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 776          return;
 777
 778       case ir_unop_any:
 779          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 780          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
 781          return;
 782
 783       default:
 784          assert(!"not reached");
 785          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
 786          return;
 787       }
 788       return;
 789    }
 790
 791    ir->condition->accept(this);
 792
 793    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
 794 }
 795
 796 static dst_reg
 797 with_writemask(dst_reg const & r, int mask)
 798 {
 799    dst_reg result = r;
 800    result.writemask = mask;
 801    return result;
 802 }
 803
 804 void
 805 vec4_visitor::emit_attribute_fixups()
 806 {
 807    dst_reg sign_recovery_shift;
 808    dst_reg normalize_factor;
 809    dst_reg es3_normalize_factor;
 810
 811    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
 812       if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
 813          uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
 814          dst_reg reg(ATTR, i);
 815          dst_reg reg_d = reg;
 816          reg_d.type = BRW_REGISTER_TYPE_D;
 817          dst_reg reg_ud = reg;
 818          reg_ud.type = BRW_REGISTER_TYPE_UD;
 819
 820          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
 821           * come in as floating point conversions of the integer values.
 822           */
 823          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
 824             dst_reg dst = reg;
 825             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 826             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
 827             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
 828          }
 829
 830          /* Do sign recovery for 2101010 formats if required. */
 831          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 832             if (sign_recovery_shift.file == BAD_FILE) {
 833                /* shift constant: <22,22,22,30> */
 834                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
 835                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
 836                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
 837             }
 838
 839             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
 840             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
 841          }
 842
 843          /* Apply BGRA swizzle if required. */
 844          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
 845             src_reg temp = src_reg(reg);
 846             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
 847             emit(MOV(reg, temp));
 848          }
 849
 850          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
 851             /* ES 3.0 has different rules for converting signed normalized
 852              * fixed-point numbers than desktop GL.
 853              */
 854             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
 855                /* According to equation 2.2 of the ES 3.0 specification,
 856                 * signed normalization conversion is done by:
 857                 *
 858                 * f = c / (2^(b-1)-1)
 859                 */
 860                if (es3_normalize_factor.file == BAD_FILE) {
 861                   /* mul constant: 1 / (2^(b-1) - 1) */
 862                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
 863                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
 864                            src_reg(1.0f / ((1<<9) - 1))));
 865                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
 866                            src_reg(1.0f / ((1<<1) - 1))));
 867                }
 868
 869                dst_reg dst = reg;
 870                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 871                emit(MOV(dst, src_reg(reg_d)));
 872                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
 873                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
 874             } else {
 875                /* The following equations are from the OpenGL 3.2 specification:
 876                 *
 877                 * 2.1 unsigned normalization
 878                 * f = c/(2^n-1)
 879                 *
 880                 * 2.2 signed normalization
 881                 * f = (2c+1)/(2^n-1)
 882                 *
 883                 * Both of these share a common divisor, which is represented by
 884                 * "normalize_factor" in the code below.
 885                 */
 886                if (normalize_factor.file == BAD_FILE) {
 887                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
 888                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
 889                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
 890                            src_reg(1.0f / ((1<<10) - 1))));
 891                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
 892                            src_reg(1.0f / ((1<<2) - 1))));
 893                }
 894
 895                dst_reg dst = reg;
 896                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 897                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
 898
 899                /* For signed normalization, we want the numerator to be 2c+1. */
 900                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
 901                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
 902                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
 903                }
 904
 905                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
 906             }
 907          }
 908
 909          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
 910             dst_reg dst = reg;
 911             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
 912             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
 913          }
 914       }
 915    }
 916 }
 917
 918 void
 919 vec4_visitor::visit(ir_variable *ir)
 920 {
 921    dst_reg *reg = NULL;
 922
 923    if (variable_storage(ir))
 924       return;
 925
 926    switch (ir->mode) {
 927    case ir_var_in:
 928       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
 929       break;
 930
 931    case ir_var_out:
 932       reg = new(mem_ctx) dst_reg(this, ir->type);
 933
 934       for (int i = 0; i < type_size(ir->type); i++) {
 935          output_reg[ir->location + i] = *reg;
 936          output_reg[ir->location + i].reg_offset = i;
 937          output_reg[ir->location + i].type =
 938             brw_type_for_base_type(ir->type->get_scalar_type());
 939          output_reg_annotation[ir->location + i] = ir->name;
 940       }
 941       break;
 942
 943    case ir_var_auto:
 944    case ir_var_temporary:
 945       reg = new(mem_ctx) dst_reg(this, ir->type);
 946       break;
 947
 948    case ir_var_uniform:
 949       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
 950
 951       /* Thanks to the lower_ubo_reference pass, we will see only
 952        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 953        * variables, so no need for them to be in variable_ht.
 954        */
 955       if (ir->uniform_block != -1)
 956          return;
 957
 958       /* Track how big the whole uniform variable is, in case we need to put a
 959        * copy of its data into pull constants for array access.
 960        */
 961       this->uniform_size[this->uniforms] = type_size(ir->type);
 962
 963       if (!strncmp(ir->name, "gl_", 3)) {
 964          setup_builtin_uniform_values(ir);
 965       } else {
 966          setup_uniform_values(ir->location, ir->type);
 967       }
 968       break;
 969
 970    case ir_var_system_value:
 971       /* VertexID is stored by the VF as the last vertex element, but
 972        * we don't represent it with a flag in inputs_read, so we call
 973        * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
 974        */
 975       reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 976       prog_data->uses_vertexid = true;
 977
 978       switch (ir->location) {
 979       case SYSTEM_VALUE_VERTEX_ID:
 980          reg->writemask = WRITEMASK_X;
 981          break;
 982       case SYSTEM_VALUE_INSTANCE_ID:
 983          reg->writemask = WRITEMASK_Y;
 984          break;
 985       default:
 986          assert(!"not reached");
 987          break;
 988       }
 989       break;
 990
 991    default:
 992       assert(!"not reached");
 993    }
 994
 995    reg->type = brw_type_for_base_type(ir->type);
 996    hash_table_insert(this->variable_ht, reg, ir);
 997 }
 998
 999 void
1000 vec4_visitor::visit(ir_loop *ir)
1001 {
1002    dst_reg counter;
1003
1004    /* We don't want debugging output to print the whole body of the
1005     * loop as the annotation.
1006     */
1007    this->base_ir = NULL;
1008
1009    if (ir->counter != NULL) {
1010       this->base_ir = ir->counter;
1011       ir->counter->accept(this);
1012       counter = *(variable_storage(ir->counter));
1013
1014       if (ir->from != NULL) {
1015          this->base_ir = ir->from;
1016          ir->from->accept(this);
1017
1018          emit(MOV(counter, this->result));
1019       }
1020    }
1021
1022    emit(BRW_OPCODE_DO);
1023
1024    if (ir->to) {
1025       this->base_ir = ir->to;
1026       ir->to->accept(this);
1027
1028       emit(CMP(dst_null_d(), src_reg(counter), this->result,
1029                brw_conditional_for_comparison(ir->cmp)));
1030
1031       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1032       inst->predicate = BRW_PREDICATE_NORMAL;
1033    }
1034
1035    visit_instructions(&ir->body_instructions);
1036
1037
1038    if (ir->increment) {
1039       this->base_ir = ir->increment;
1040       ir->increment->accept(this);
1041       emit(ADD(counter, src_reg(counter), this->result));
1042    }
1043
1044    emit(BRW_OPCODE_WHILE);
1045 }
1046
1047 void
1048 vec4_visitor::visit(ir_loop_jump *ir)
1049 {
1050    switch (ir->mode) {
1051    case ir_loop_jump::jump_break:
1052       emit(BRW_OPCODE_BREAK);
1053       break;
1054    case ir_loop_jump::jump_continue:
1055       emit(BRW_OPCODE_CONTINUE);
1056       break;
1057    }
1058 }
1059
1060
1061 void
1062 vec4_visitor::visit(ir_function_signature *ir)
1063 {
1064    assert(0);
1065    (void)ir;
1066 }
1067
1068 void
1069 vec4_visitor::visit(ir_function *ir)
1070 {
1071    /* Ignore function bodies other than main() -- we shouldn't see calls to
1072     * them since they should all be inlined.
1073     */
1074    if (strcmp(ir->name, "main") == 0) {
1075       const ir_function_signature *sig;
1076       exec_list empty;
1077
1078       sig = ir->matching_signature(&empty);
1079
1080       assert(sig);
1081
1082       visit_instructions(&sig->body);
1083    }
1084 }
1085
1086 bool
1087 vec4_visitor::try_emit_sat(ir_expression *ir)
1088 {
1089    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1090    if (!sat_src)
1091       return false;
1092
1093    sat_src->accept(this);
1094    src_reg src = this->result;
1095
1096    this->result = src_reg(this, ir->type);
1097    vec4_instruction *inst;
1098    inst = emit(MOV(dst_reg(this->result), src));
1099    inst->saturate = true;
1100
1101    return true;
1102 }
1103
1104 void
1105 vec4_visitor::emit_bool_comparison(unsigned int op,
1106                                  dst_reg dst, src_reg src0, src_reg src1)
1107 {
1108    /* original gen4 does destination conversion before comparison. */
1109    if (intel->gen < 5)
1110       dst.type = src0.type;
1111
1112    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1113
1114    dst.type = BRW_REGISTER_TYPE_D;
1115    emit(AND(dst, src_reg(dst), src_reg(0x1)));
1116 }
1117
1118 void
1119 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1120                           src_reg src0, src_reg src1)
1121 {
1122    vec4_instruction *inst;
1123
1124    if (intel->gen >= 6) {
1125       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1126       inst->conditional_mod = conditionalmod;
1127    } else {
1128       emit(CMP(dst, src0, src1, conditionalmod));
1129
1130       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131       inst->predicate = BRW_PREDICATE_NORMAL;
1132    }
1133 }
1134
1135 void
1136 vec4_visitor::visit(ir_expression *ir)
1137 {
1138    unsigned int operand;
1139    src_reg op[Elements(ir->operands)];
1140    src_reg result_src;
1141    dst_reg result_dst;
1142    vec4_instruction *inst;
1143
1144    if (try_emit_sat(ir))
1145       return;
1146
1147    for (operand = 0; operand < ir->get_num_operands(); operand++) {
1148       this->result.file = BAD_FILE;
1149       ir->operands[operand]->accept(this);
1150       if (this->result.file == BAD_FILE) {
1151          printf("Failed to get tree for expression operand:\n");
1152          ir->operands[operand]->print();
1153          exit(1);
1154       }
1155       op[operand] = this->result;
1156
1157       /* Matrix expression operands should have been broken down to vector
1158        * operations already.
1159        */
1160       assert(!ir->operands[operand]->type->is_matrix());
1161    }
1162
1163    int vector_elements = ir->operands[0]->type->vector_elements;
1164    if (ir->operands[1]) {
1165       vector_elements = MAX2(vector_elements,
1166                              ir->operands[1]->type->vector_elements);
1167    }
1168
1169    this->result.file = BAD_FILE;
1170
1171    /* Storage for our result.  Ideally for an assignment we'd be using
1172     * the actual storage for the result here, instead.
1173     */
1174    result_src = src_reg(this, ir->type);
1175    /* convenience for the emit functions below. */
1176    result_dst = dst_reg(result_src);
1177    /* If nothing special happens, this is the result. */
1178    this->result = result_src;
1179    /* Limit writes to the channels that will be used by result_src later.
1180     * This does limit this temp's use as a temporary for multi-instruction
1181     * sequences.
1182     */
1183    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1184
1185    switch (ir->operation) {
1186    case ir_unop_logic_not:
1187       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1188        * ones complement of the whole register, not just bit 0.
1189        */
1190       emit(XOR(result_dst, op[0], src_reg(1)));
1191       break;
1192    case ir_unop_neg:
1193       op[0].negate = !op[0].negate;
1194       this->result = op[0];
1195       break;
1196    case ir_unop_abs:
1197       op[0].abs = true;
1198       op[0].negate = false;
1199       this->result = op[0];
1200       break;
1201
1202    case ir_unop_sign:
1203       emit(MOV(result_dst, src_reg(0.0f)));
1204
1205       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1206       inst = emit(MOV(result_dst, src_reg(1.0f)));
1207       inst->predicate = BRW_PREDICATE_NORMAL;
1208
1209       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1210       inst = emit(MOV(result_dst, src_reg(-1.0f)));
1211       inst->predicate = BRW_PREDICATE_NORMAL;
1212
1213       break;
1214
1215    case ir_unop_rcp:
1216       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1217       break;
1218
1219    case ir_unop_exp2:
1220       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1221       break;
1222    case ir_unop_log2:
1223       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1224       break;
1225    case ir_unop_exp:
1226    case ir_unop_log:
1227       assert(!"not reached: should be handled by ir_explog_to_explog2");
1228       break;
1229    case ir_unop_sin:
1230    case ir_unop_sin_reduced:
1231       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1232       break;
1233    case ir_unop_cos:
1234    case ir_unop_cos_reduced:
1235       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1236       break;
1237
1238    case ir_unop_dFdx:
1239    case ir_unop_dFdy:
1240       assert(!"derivatives not valid in vertex shader");
1241       break;
1242
1243    case ir_unop_noise:
1244       assert(!"not reached: should be handled by lower_noise");
1245       break;
1246
1247    case ir_binop_add:
1248       emit(ADD(result_dst, op[0], op[1]));
1249       break;
1250    case ir_binop_sub:
1251       assert(!"not reached: should be handled by ir_sub_to_add_neg");
1252       break;
1253
1254    case ir_binop_mul:
1255       if (ir->type->is_integer()) {
1256          /* For integer multiplication, the MUL uses the low 16 bits
1257           * of one of the operands (src0 on gen6, src1 on gen7).  The
1258           * MACH accumulates in the contribution of the upper 16 bits
1259           * of that operand.
1260           *
1261           * FINISHME: Emit just the MUL if we know an operand is small
1262           * enough.
1263           */
1264          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1265
1266          emit(MUL(acc, op[0], op[1]));
1267          emit(MACH(dst_null_d(), op[0], op[1]));
1268          emit(MOV(result_dst, src_reg(acc)));
1269       } else {
1270          emit(MUL(result_dst, op[0], op[1]));
1271       }
1272       break;
1273    case ir_binop_div:
1274       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1275       assert(ir->type->is_integer());
1276       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1277       break;
1278    case ir_binop_mod:
1279       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1280       assert(ir->type->is_integer());
1281       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1282       break;
1283
1284    case ir_binop_less:
1285    case ir_binop_greater:
1286    case ir_binop_lequal:
1287    case ir_binop_gequal:
1288    case ir_binop_equal:
1289    case ir_binop_nequal: {
1290       emit(CMP(result_dst, op[0], op[1],
1291                brw_conditional_for_comparison(ir->operation)));
1292       emit(AND(result_dst, result_src, src_reg(0x1)));
1293       break;
1294    }
1295
1296    case ir_binop_all_equal:
1297       /* "==" operator producing a scalar boolean. */
1298       if (ir->operands[0]->type->is_vector() ||
1299           ir->operands[1]->type->is_vector()) {
1300          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1301          emit(MOV(result_dst, src_reg(0)));
1302          inst = emit(MOV(result_dst, src_reg(1)));
1303          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1304       } else {
1305          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1306          emit(AND(result_dst, result_src, src_reg(0x1)));
1307       }
1308       break;
1309    case ir_binop_any_nequal:
1310       /* "!=" operator producing a scalar boolean. */
1311       if (ir->operands[0]->type->is_vector() ||
1312           ir->operands[1]->type->is_vector()) {
1313          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1314
1315          emit(MOV(result_dst, src_reg(0)));
1316          inst = emit(MOV(result_dst, src_reg(1)));
1317          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1318       } else {
1319          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1320          emit(AND(result_dst, result_src, src_reg(0x1)));
1321       }
1322       break;
1323
1324    case ir_unop_any:
1325       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1326       emit(MOV(result_dst, src_reg(0)));
1327
1328       inst = emit(MOV(result_dst, src_reg(1)));
1329       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1330       break;
1331
1332    case ir_binop_logic_xor:
1333       emit(XOR(result_dst, op[0], op[1]));
1334       break;
1335
1336    case ir_binop_logic_or:
1337       emit(OR(result_dst, op[0], op[1]));
1338       break;
1339
1340    case ir_binop_logic_and:
1341       emit(AND(result_dst, op[0], op[1]));
1342       break;
1343
1344    case ir_binop_dot:
1345       assert(ir->operands[0]->type->is_vector());
1346       assert(ir->operands[0]->type == ir->operands[1]->type);
1347       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1348       break;
1349
1350    case ir_unop_sqrt:
1351       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1352       break;
1353    case ir_unop_rsq:
1354       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1355       break;
1356
1357    case ir_unop_bitcast_i2f:
1358    case ir_unop_bitcast_u2f:
1359       this->result = op[0];
1360       this->result.type = BRW_REGISTER_TYPE_F;
1361       break;
1362
1363    case ir_unop_bitcast_f2i:
1364       this->result = op[0];
1365       this->result.type = BRW_REGISTER_TYPE_D;
1366       break;
1367
1368    case ir_unop_bitcast_f2u:
1369       this->result = op[0];
1370       this->result.type = BRW_REGISTER_TYPE_UD;
1371       break;
1372
1373    case ir_unop_i2f:
1374    case ir_unop_i2u:
1375    case ir_unop_u2i:
1376    case ir_unop_u2f:
1377    case ir_unop_b2f:
1378    case ir_unop_b2i:
1379    case ir_unop_f2i:
1380    case ir_unop_f2u:
1381       emit(MOV(result_dst, op[0]));
1382       break;
1383    case ir_unop_f2b:
1384    case ir_unop_i2b: {
1385       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1386       emit(AND(result_dst, result_src, src_reg(1)));
1387       break;
1388    }
1389
1390    case ir_unop_trunc:
1391       emit(RNDZ(result_dst, op[0]));
1392       break;
1393    case ir_unop_ceil:
1394       op[0].negate = !op[0].negate;
1395       inst = emit(RNDD(result_dst, op[0]));
1396       this->result.negate = true;
1397       break;
1398    case ir_unop_floor:
1399       inst = emit(RNDD(result_dst, op[0]));
1400       break;
1401    case ir_unop_fract:
1402       inst = emit(FRC(result_dst, op[0]));
1403       break;
1404    case ir_unop_round_even:
1405       emit(RNDE(result_dst, op[0]));
1406       break;
1407
1408    case ir_binop_min:
1409       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1410       break;
1411    case ir_binop_max:
1412       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1413       break;
1414
1415    case ir_binop_pow:
1416       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1417       break;
1418
1419    case ir_unop_bit_not:
1420       inst = emit(NOT(result_dst, op[0]));
1421       break;
1422    case ir_binop_bit_and:
1423       inst = emit(AND(result_dst, op[0], op[1]));
1424       break;
1425    case ir_binop_bit_xor:
1426       inst = emit(XOR(result_dst, op[0], op[1]));
1427       break;
1428    case ir_binop_bit_or:
1429       inst = emit(OR(result_dst, op[0], op[1]));
1430       break;
1431
1432    case ir_binop_lshift:
1433       inst = emit(SHL(result_dst, op[0], op[1]));
1434       break;
1435
1436    case ir_binop_rshift:
1437       if (ir->type->base_type == GLSL_TYPE_INT)
1438          inst = emit(ASR(result_dst, op[0], op[1]));
1439       else
1440          inst = emit(SHR(result_dst, op[0], op[1]));
1441       break;
1442
1443    case ir_binop_ubo_load: {
1444       ir_constant *uniform_block = ir->operands[0]->as_constant();
1445       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1446       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1447       src_reg offset = op[1];
1448
1449       /* Now, load the vector from that offset. */
1450       assert(ir->type->is_vector() || ir->type->is_scalar());
1451
1452       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1453       packed_consts.type = result.type;
1454       src_reg surf_index =
1455          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1456       if (const_offset_ir) {
1457          offset = src_reg(const_offset / 16);
1458       } else {
1459          emit(SHR(dst_reg(offset), offset, src_reg(4)));
1460       }
1461
1462       vec4_instruction *pull =
1463          emit(new(mem_ctx) vec4_instruction(this,
1464                                             VS_OPCODE_PULL_CONSTANT_LOAD,
1465                                             dst_reg(packed_consts),
1466                                             surf_index,
1467                                             offset));
1468       pull->base_mrf = 14;
1469       pull->mlen = 1;
1470
1471       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1472       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1473                                             const_offset % 16 / 4,
1474                                             const_offset % 16 / 4,
1475                                             const_offset % 16 / 4);
1476
1477       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
1478       if (ir->type->base_type == GLSL_TYPE_BOOL) {
1479          emit(CMP(result_dst, packed_consts, src_reg(0u),
1480                   BRW_CONDITIONAL_NZ));
1481          emit(AND(result_dst, result, src_reg(0x1)));
1482       } else {
1483          emit(MOV(result_dst, packed_consts));
1484       }
1485       break;
1486    }
1487
1488    case ir_quadop_vector:
1489       assert(!"not reached: should be handled by lower_quadop_vector");
1490       break;
1491    }
1492 }
1493
1494
1495 void
1496 vec4_visitor::visit(ir_swizzle *ir)
1497 {
1498    src_reg src;
1499    int i = 0;
1500    int swizzle[4];
1501
1502    /* Note that this is only swizzles in expressions, not those on the left
1503     * hand side of an assignment, which do write masking.  See ir_assignment
1504     * for that.
1505     */
1506
1507    ir->val->accept(this);
1508    src = this->result;
1509    assert(src.file != BAD_FILE);
1510
1511    for (i = 0; i < ir->type->vector_elements; i++) {
1512       switch (i) {
1513       case 0:
1514          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1515          break;
1516       case 1:
1517          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1518          break;
1519       case 2:
1520          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1521          break;
1522       case 3:
1523          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1524             break;
1525       }
1526    }
1527    for (; i < 4; i++) {
1528       /* Replicate the last channel out. */
1529       swizzle[i] = swizzle[ir->type->vector_elements - 1];
1530    }
1531
1532    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1533
1534    this->result = src;
1535 }
1536
1537 void
1538 vec4_visitor::visit(ir_dereference_variable *ir)
1539 {
1540    const struct glsl_type *type = ir->type;
1541    dst_reg *reg = variable_storage(ir->var);
1542
1543    if (!reg) {
1544       fail("Failed to find variable storage for %s\n", ir->var->name);
1545       this->result = src_reg(brw_null_reg());
1546       return;
1547    }
1548
1549    this->result = src_reg(*reg);
1550
1551    /* System values get their swizzle from the dst_reg writemask */
1552    if (ir->var->mode == ir_var_system_value)
1553       return;
1554
1555    if (type->is_scalar() || type->is_vector() || type->is_matrix())
1556       this->result.swizzle = swizzle_for_size(type->vector_elements);
1557 }
1558
1559 void
1560 vec4_visitor::visit(ir_dereference_array *ir)
1561 {
1562    ir_constant *constant_index;
1563    src_reg src;
1564    int element_size = type_size(ir->type);
1565
1566    constant_index = ir->array_index->constant_expression_value();
1567
1568    ir->array->accept(this);
1569    src = this->result;
1570
1571    if (constant_index) {
1572       src.reg_offset += constant_index->value.i[0] * element_size;
1573    } else {
1574       /* Variable index array dereference.  It eats the "vec4" of the
1575        * base of the array and an index that offsets the Mesa register
1576        * index.
1577        */
1578       ir->array_index->accept(this);
1579
1580       src_reg index_reg;
1581
1582       if (element_size == 1) {
1583          index_reg = this->result;
1584       } else {
1585          index_reg = src_reg(this, glsl_type::int_type);
1586
1587          emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1588       }
1589
1590       if (src.reladdr) {
1591          src_reg temp = src_reg(this, glsl_type::int_type);
1592
1593          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1594
1595          index_reg = temp;
1596       }
1597
1598       src.reladdr = ralloc(mem_ctx, src_reg);
1599       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1600    }
1601
1602    /* If the type is smaller than a vec4, replicate the last channel out. */
1603    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1604       src.swizzle = swizzle_for_size(ir->type->vector_elements);
1605    else
1606       src.swizzle = BRW_SWIZZLE_NOOP;
1607    src.type = brw_type_for_base_type(ir->type);
1608
1609    this->result = src;
1610 }
1611
1612 void
1613 vec4_visitor::visit(ir_dereference_record *ir)
1614 {
1615    unsigned int i;
1616    const glsl_type *struct_type = ir->record->type;
1617    int offset = 0;
1618
1619    ir->record->accept(this);
1620
1621    for (i = 0; i < struct_type->length; i++) {
1622       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1623          break;
1624       offset += type_size(struct_type->fields.structure[i].type);
1625    }
1626
1627    /* If the type is smaller than a vec4, replicate the last channel out. */
1628    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1629       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1630    else
1631       this->result.swizzle = BRW_SWIZZLE_NOOP;
1632    this->result.type = brw_type_for_base_type(ir->type);
1633
1634    this->result.reg_offset += offset;
1635 }
1636
1637 /**
1638  * We want to be careful in assignment setup to hit the actual storage
1639  * instead of potentially using a temporary like we might with the
1640  * ir_dereference handler.
1641  */
1642 static dst_reg
1643 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1644 {
1645    /* The LHS must be a dereference.  If the LHS is a variable indexed array
1646     * access of a vector, it must be separated into a series conditional moves
1647     * before reaching this point (see ir_vec_index_to_cond_assign).
1648     */
1649    assert(ir->as_dereference());
1650    ir_dereference_array *deref_array = ir->as_dereference_array();
1651    if (deref_array) {
1652       assert(!deref_array->array->type->is_vector());
1653    }
1654
1655    /* Use the rvalue deref handler for the most part.  We'll ignore
1656     * swizzles in it and write swizzles using writemask, though.
1657     */
1658    ir->accept(v);
1659    return dst_reg(v->result);
1660 }
1661
1662 void
1663 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1664                               const struct glsl_type *type, uint32_t predicate)
1665 {
1666    if (type->base_type == GLSL_TYPE_STRUCT) {
1667       for (unsigned int i = 0; i < type->length; i++) {
1668          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1669       }
1670       return;
1671    }
1672
1673    if (type->is_array()) {
1674       for (unsigned int i = 0; i < type->length; i++) {
1675          emit_block_move(dst, src, type->fields.array, predicate);
1676       }
1677       return;
1678    }
1679
1680    if (type->is_matrix()) {
1681       const struct glsl_type *vec_type;
1682
1683       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1684                                          type->vector_elements, 1);
1685
1686       for (int i = 0; i < type->matrix_columns; i++) {
1687          emit_block_move(dst, src, vec_type, predicate);
1688       }
1689       return;
1690    }
1691
1692    assert(type->is_scalar() || type->is_vector());
1693
1694    dst->type = brw_type_for_base_type(type);
1695    src->type = dst->type;
1696
1697    dst->writemask = (1 << type->vector_elements) - 1;
1698
1699    src->swizzle = swizzle_for_size(type->vector_elements);
1700
1701    vec4_instruction *inst = emit(MOV(*dst, *src));
1702    inst->predicate = predicate;
1703
1704    dst->reg_offset++;
1705    src->reg_offset++;
1706 }
1707
1708
1709 /* If the RHS processing resulted in an instruction generating a
1710  * temporary value, and it would be easy to rewrite the instruction to
1711  * generate its result right into the LHS instead, do so.  This ends
1712  * up reliably removing instructions where it can be tricky to do so
1713  * later without real UD chain information.
1714  */
1715 bool
1716 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1717                                      dst_reg dst,
1718                                      src_reg src,
1719                                      vec4_instruction *pre_rhs_inst,
1720                                      vec4_instruction *last_rhs_inst)
1721 {
1722    /* This could be supported, but it would take more smarts. */
1723    if (ir->condition)
1724       return false;
1725
1726    if (pre_rhs_inst == last_rhs_inst)
1727       return false; /* No instructions generated to work with. */
1728
1729    /* Make sure the last instruction generated our source reg. */
1730    if (src.file != GRF ||
1731        src.file != last_rhs_inst->dst.file ||
1732        src.reg != last_rhs_inst->dst.reg ||
1733        src.reg_offset != last_rhs_inst->dst.reg_offset ||
1734        src.reladdr ||
1735        src.abs ||
1736        src.negate ||
1737        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1738       return false;
1739
1740    /* Check that that last instruction fully initialized the channels
1741     * we want to use, in the order we want to use them.  We could
1742     * potentially reswizzle the operands of many instructions so that
1743     * we could handle out of order channels, but don't yet.
1744     */
1745
1746    for (unsigned i = 0; i < 4; i++) {
1747       if (dst.writemask & (1 << i)) {
1748          if (!(last_rhs_inst->dst.writemask & (1 << i)))
1749             return false;
1750
1751          if (BRW_GET_SWZ(src.swizzle, i) != i)
1752             return false;
1753       }
1754    }
1755
1756    /* Success!  Rewrite the instruction. */
1757    last_rhs_inst->dst.file = dst.file;
1758    last_rhs_inst->dst.reg = dst.reg;
1759    last_rhs_inst->dst.reg_offset = dst.reg_offset;
1760    last_rhs_inst->dst.reladdr = dst.reladdr;
1761    last_rhs_inst->dst.writemask &= dst.writemask;
1762
1763    return true;
1764 }
1765
1766 void
1767 vec4_visitor::visit(ir_assignment *ir)
1768 {
1769    dst_reg dst = get_assignment_lhs(ir->lhs, this);
1770    uint32_t predicate = BRW_PREDICATE_NONE;
1771
1772    if (!ir->lhs->type->is_scalar() &&
1773        !ir->lhs->type->is_vector()) {
1774       ir->rhs->accept(this);
1775       src_reg src = this->result;
1776
1777       if (ir->condition) {
1778          emit_bool_to_cond_code(ir->condition, &predicate);
1779       }
1780
1781       /* emit_block_move doesn't account for swizzles in the source register.
1782        * This should be ok, since the source register is a structure or an
1783        * array, and those can't be swizzled.  But double-check to be sure.
1784        */
1785       assert(src.swizzle ==
1786              (ir->rhs->type->is_matrix()
1787               ? swizzle_for_size(ir->rhs->type->vector_elements)
1788               : BRW_SWIZZLE_NOOP));
1789
1790       emit_block_move(&dst, &src, ir->rhs->type, predicate);
1791       return;
1792    }
1793
1794    /* Now we're down to just a scalar/vector with writemasks. */
1795    int i;
1796
1797    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1798    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1799
1800    ir->rhs->accept(this);
1801
1802    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1803
1804    src_reg src = this->result;
1805
1806    int swizzles[4];
1807    int first_enabled_chan = 0;
1808    int src_chan = 0;
1809
1810    assert(ir->lhs->type->is_vector() ||
1811           ir->lhs->type->is_scalar());
1812    dst.writemask = ir->write_mask;
1813
1814    for (int i = 0; i < 4; i++) {
1815       if (dst.writemask & (1 << i)) {
1816          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1817          break;
1818       }
1819    }
1820
1821    /* Swizzle a small RHS vector into the channels being written.
1822     *
1823     * glsl ir treats write_mask as dictating how many channels are
1824     * present on the RHS while in our instructions we need to make
1825     * those channels appear in the slots of the vec4 they're written to.
1826     */
1827    for (int i = 0; i < 4; i++) {
1828       if (dst.writemask & (1 << i))
1829          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1830       else
1831          swizzles[i] = first_enabled_chan;
1832    }
1833    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1834                               swizzles[2], swizzles[3]);
1835
1836    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1837       return;
1838    }
1839
1840    if (ir->condition) {
1841       emit_bool_to_cond_code(ir->condition, &predicate);
1842    }
1843
1844    for (i = 0; i < type_size(ir->lhs->type); i++) {
1845       vec4_instruction *inst = emit(MOV(dst, src));
1846       inst->predicate = predicate;
1847
1848       dst.reg_offset++;
1849       src.reg_offset++;
1850    }
1851 }
1852
1853 void
1854 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1855 {
1856    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1857       foreach_list(node, &ir->components) {
1858          ir_constant *field_value = (ir_constant *)node;
1859
1860          emit_constant_values(dst, field_value);
1861       }
1862       return;
1863    }
1864
1865    if (ir->type->is_array()) {
1866       for (unsigned int i = 0; i < ir->type->length; i++) {
1867          emit_constant_values(dst, ir->array_elements[i]);
1868       }
1869       return;
1870    }
1871
1872    if (ir->type->is_matrix()) {
1873       for (int i = 0; i < ir->type->matrix_columns; i++) {
1874          float *vec = &ir->value.f[i * ir->type->vector_elements];
1875
1876          for (int j = 0; j < ir->type->vector_elements; j++) {
1877             dst->writemask = 1 << j;
1878             dst->type = BRW_REGISTER_TYPE_F;
1879
1880             emit(MOV(*dst, src_reg(vec[j])));
1881          }
1882          dst->reg_offset++;
1883       }
1884       return;
1885    }
1886
1887    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1888
1889    for (int i = 0; i < ir->type->vector_elements; i++) {
1890       if (!(remaining_writemask & (1 << i)))
1891          continue;
1892
1893       dst->writemask = 1 << i;
1894       dst->type = brw_type_for_base_type(ir->type);
1895
1896       /* Find other components that match the one we're about to
1897        * write.  Emits fewer instructions for things like vec4(0.5,
1898        * 1.5, 1.5, 1.5).
1899        */
1900       for (int j = i + 1; j < ir->type->vector_elements; j++) {
1901          if (ir->type->base_type == GLSL_TYPE_BOOL) {
1902             if (ir->value.b[i] == ir->value.b[j])
1903                dst->writemask |= (1 << j);
1904          } else {
1905             /* u, i, and f storage all line up, so no need for a
1906              * switch case for comparing each type.
1907              */
1908             if (ir->value.u[i] == ir->value.u[j])
1909                dst->writemask |= (1 << j);
1910          }
1911       }
1912
1913       switch (ir->type->base_type) {
1914       case GLSL_TYPE_FLOAT:
1915          emit(MOV(*dst, src_reg(ir->value.f[i])));
1916          break;
1917       case GLSL_TYPE_INT:
1918          emit(MOV(*dst, src_reg(ir->value.i[i])));
1919          break;
1920       case GLSL_TYPE_UINT:
1921          emit(MOV(*dst, src_reg(ir->value.u[i])));
1922          break;
1923       case GLSL_TYPE_BOOL:
1924          emit(MOV(*dst, src_reg(ir->value.b[i])));
1925          break;
1926       default:
1927          assert(!"Non-float/uint/int/bool constant");
1928          break;
1929       }
1930
1931       remaining_writemask &= ~dst->writemask;
1932    }
1933    dst->reg_offset++;
1934 }
1935
1936 void
1937 vec4_visitor::visit(ir_constant *ir)
1938 {
1939    dst_reg dst = dst_reg(this, ir->type);
1940    this->result = src_reg(dst);
1941
1942    emit_constant_values(&dst, ir);
1943 }
1944
1945 void
1946 vec4_visitor::visit(ir_call *ir)
1947 {
1948    assert(!"not reached");
1949 }
1950
1951 void
1952 vec4_visitor::visit(ir_texture *ir)
1953 {
1954    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1955
1956    /* Should be lowered by do_lower_texture_projection */
1957    assert(!ir->projector);
1958
1959    /* Generate code to compute all the subexpression trees.  This has to be
1960     * done before loading any values into MRFs for the sampler message since
1961     * generating these values may involve SEND messages that need the MRFs.
1962     */
1963    src_reg coordinate;
1964    if (ir->coordinate) {
1965       ir->coordinate->accept(this);
1966       coordinate = this->result;
1967    }
1968
1969    src_reg shadow_comparitor;
1970    if (ir->shadow_comparitor) {
1971       ir->shadow_comparitor->accept(this);
1972       shadow_comparitor = this->result;
1973    }
1974
1975    src_reg lod, dPdx, dPdy;
1976    switch (ir->op) {
1977    case ir_txf:
1978    case ir_txl:
1979    case ir_txs:
1980       ir->lod_info.lod->accept(this);
1981       lod = this->result;
1982       break;
1983    case ir_txd:
1984       ir->lod_info.grad.dPdx->accept(this);
1985       dPdx = this->result;
1986
1987       ir->lod_info.grad.dPdy->accept(this);
1988       dPdy = this->result;
1989       break;
1990    case ir_tex:
1991    case ir_txb:
1992       break;
1993    }
1994
1995    vec4_instruction *inst = NULL;
1996    switch (ir->op) {
1997    case ir_tex:
1998    case ir_txl:
1999       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2000       break;
2001    case ir_txd:
2002       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2003       break;
2004    case ir_txf:
2005       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2006       break;
2007    case ir_txs:
2008       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2009       break;
2010    case ir_txb:
2011       assert(!"TXB is not valid for vertex shaders.");
2012    }
2013
2014    /* Texel offsets go in the message header; Gen4 also requires headers. */
2015    inst->header_present = ir->offset || intel->gen < 5;
2016    inst->base_mrf = 2;
2017    inst->mlen = inst->header_present + 1; /* always at least one */
2018    inst->sampler = sampler;
2019    inst->dst = dst_reg(this, ir->type);
2020    inst->shadow_compare = ir->shadow_comparitor != NULL;
2021
2022    if (ir->offset != NULL && ir->op != ir_txf)
2023       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2024
2025    /* MRF for the first parameter */
2026    int param_base = inst->base_mrf + inst->header_present;
2027
2028    if (ir->op == ir_txs) {
2029       int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2030       emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, writemask),
2031            lod));
2032    } else {
2033       int i, coord_mask = 0, zero_mask = 0;
2034       /* Load the coordinate */
2035       /* FINISHME: gl_clamp_mask and saturate */
2036       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2037          coord_mask |= (1 << i);
2038       for (; i < 4; i++)
2039          zero_mask |= (1 << i);
2040
2041       if (ir->offset && ir->op == ir_txf) {
2042          /* It appears that the ld instruction used for txf does its
2043           * address bounds check before adding in the offset.  To work
2044           * around this, just add the integer offset to the integer
2045           * texel coordinate, and don't put the offset in the header.
2046           */
2047          ir_constant *offset = ir->offset->as_constant();
2048          assert(offset);
2049
2050          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2051             src_reg src = coordinate;
2052             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2053                                        BRW_GET_SWZ(src.swizzle, j),
2054                                        BRW_GET_SWZ(src.swizzle, j),
2055                                        BRW_GET_SWZ(src.swizzle, j));
2056             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2057                      src, offset->value.i[j]));
2058          }
2059       } else {
2060          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2061                   coordinate));
2062       }
2063       emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2064                src_reg(0)));
2065       /* Load the shadow comparitor */
2066       if (ir->shadow_comparitor) {
2067          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2068                           WRITEMASK_X),
2069                   shadow_comparitor));
2070          inst->mlen++;
2071       }
2072
2073       /* Load the LOD info */
2074       if (ir->op == ir_txl) {
2075          int mrf, writemask;
2076          if (intel->gen >= 5) {
2077             mrf = param_base + 1;
2078             if (ir->shadow_comparitor) {
2079                writemask = WRITEMASK_Y;
2080                /* mlen already incremented */
2081             } else {
2082                writemask = WRITEMASK_X;
2083                inst->mlen++;
2084             }
2085          } else /* intel->gen == 4 */ {
2086             mrf = param_base;
2087             writemask = WRITEMASK_Z;
2088          }
2089          emit(MOV(dst_reg(MRF, mrf, ir->lod_info.lod->type, writemask), lod));
2090       } else if (ir->op == ir_txf) {
2091          emit(MOV(dst_reg(MRF, param_base, ir->lod_info.lod->type, WRITEMASK_W),
2092                   lod));
2093       } else if (ir->op == ir_txd) {
2094          const glsl_type *type = ir->lod_info.grad.dPdx->type;
2095
2096          if (intel->gen >= 5) {
2097             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2098             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2099             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2100             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2101             inst->mlen++;
2102
2103             if (ir->type->vector_elements == 3) {
2104                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2105                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2106                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2107                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2108                inst->mlen++;
2109             }
2110          } else /* intel->gen == 4 */ {
2111             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2112             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2113             inst->mlen += 2;
2114          }
2115       }
2116    }
2117
2118    emit(inst);
2119
2120    swizzle_result(ir, src_reg(inst->dst), sampler);
2121 }
2122
2123 void
2124 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2125 {
2126    this->result = orig_val;
2127
2128    int s = c->key.tex.swizzles[sampler];
2129
2130    if (ir->op == ir_txs || ir->type == glsl_type::float_type
2131                         || s == SWIZZLE_NOOP)
2132       return;
2133
2134    int zero_mask = 0, one_mask = 0, copy_mask = 0;
2135    int swizzle[4];
2136
2137    for (int i = 0; i < 4; i++) {
2138       switch (GET_SWZ(s, i)) {
2139       case SWIZZLE_ZERO:
2140          zero_mask |= (1 << i);
2141          break;
2142       case SWIZZLE_ONE:
2143          one_mask |= (1 << i);
2144          break;
2145       default:
2146          copy_mask |= (1 << i);
2147          swizzle[i] = GET_SWZ(s, i);
2148          break;
2149       }
2150    }
2151
2152    this->result = src_reg(this, ir->type);
2153    dst_reg swizzled_result(this->result);
2154
2155    if (copy_mask) {
2156       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2157       swizzled_result.writemask = copy_mask;
2158       emit(MOV(swizzled_result, orig_val));
2159    }
2160
2161    if (zero_mask) {
2162       swizzled_result.writemask = zero_mask;
2163       emit(MOV(swizzled_result, src_reg(0.0f)));
2164    }
2165
2166    if (one_mask) {
2167       swizzled_result.writemask = one_mask;
2168       emit(MOV(swizzled_result, src_reg(1.0f)));
2169    }
2170 }
2171
2172 void
2173 vec4_visitor::visit(ir_return *ir)
2174 {
2175    assert(!"not reached");
2176 }
2177
2178 void
2179 vec4_visitor::visit(ir_discard *ir)
2180 {
2181    assert(!"not reached");
2182 }
2183
2184 void
2185 vec4_visitor::visit(ir_if *ir)
2186 {
2187    /* Don't point the annotation at the if statement, because then it plus
2188     * the then and else blocks get printed.
2189     */
2190    this->base_ir = ir->condition;
2191
2192    if (intel->gen == 6) {
2193       emit_if_gen6(ir);
2194    } else {
2195       uint32_t predicate;
2196       emit_bool_to_cond_code(ir->condition, &predicate);
2197       emit(IF(predicate));
2198    }
2199
2200    visit_instructions(&ir->then_instructions);
2201
2202    if (!ir->else_instructions.is_empty()) {
2203       this->base_ir = ir->condition;
2204       emit(BRW_OPCODE_ELSE);
2205
2206       visit_instructions(&ir->else_instructions);
2207    }
2208
2209    this->base_ir = ir->condition;
2210    emit(BRW_OPCODE_ENDIF);
2211 }
2212
2213 void
2214 vec4_visitor::emit_ndc_computation()
2215 {
2216    /* Get the position */
2217    src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2218
2219    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2220    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2221    output_reg[BRW_VERT_RESULT_NDC] = ndc;
2222
2223    current_annotation = "NDC";
2224    dst_reg ndc_w = ndc;
2225    ndc_w.writemask = WRITEMASK_W;
2226    src_reg pos_w = pos;
2227    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2228    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2229
2230    dst_reg ndc_xyz = ndc;
2231    ndc_xyz.writemask = WRITEMASK_XYZ;
2232
2233    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2234 }
2235
2236 void
2237 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2238 {
2239    if (intel->gen < 6 &&
2240        ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2241         c->key.userclip_active || brw->has_negative_rhw_bug)) {
2242       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2243       dst_reg header1_w = header1;
2244       header1_w.writemask = WRITEMASK_W;
2245       GLuint i;
2246
2247       emit(MOV(header1, 0u));
2248
2249       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2250          src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2251
2252          current_annotation = "Point size";
2253          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2254          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2255       }
2256
2257       current_annotation = "Clipping flags";
2258       for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2259          vec4_instruction *inst;
2260
2261          inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2262                          src_reg(this->userplane[i])));
2263          inst->conditional_mod = BRW_CONDITIONAL_L;
2264
2265          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2266          inst->predicate = BRW_PREDICATE_NORMAL;
2267       }
2268
2269       /* i965 clipping workaround:
2270        * 1) Test for -ve rhw
2271        * 2) If set,
2272        *      set ndc = (0,0,0,0)
2273        *      set ucp[6] = 1
2274        *
2275        * Later, clipping will detect ucp[6] and ensure the primitive is
2276        * clipped against all fixed planes.
2277        */
2278       if (brw->has_negative_rhw_bug) {
2279 #if 0
2280          /* FINISHME */
2281          brw_CMP(p,
2282                  vec8(brw_null_reg()),
2283                  BRW_CONDITIONAL_L,
2284                  brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2285                  brw_imm_f(0));
2286
2287          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2288          brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2289          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2290 #endif
2291       }
2292
2293       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2294    } else if (intel->gen < 6) {
2295       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2296    } else {
2297       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2298       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2299          emit(MOV(brw_writemask(reg, WRITEMASK_W),
2300                   src_reg(output_reg[VERT_RESULT_PSIZ])));
2301       }
2302    }
2303 }
2304
2305 void
2306 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2307 {
2308    if (intel->gen < 6) {
2309       /* Clip distance slots are set aside in gen5, but they are not used.  It
2310        * is not clear whether we actually need to set aside space for them,
2311        * but the performance cost is negligible.
2312        */
2313       return;
2314    }
2315
2316    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2317     *
2318     *     "If a linked set of shaders forming the vertex stage contains no
2319     *     static write to gl_ClipVertex or gl_ClipDistance, but the
2320     *     application has requested clipping against user clip planes through
2321     *     the API, then the coordinate written to gl_Position is used for
2322     *     comparison against the user clip planes."
2323     *
2324     * This function is only called if the shader didn't write to
2325     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
2326     * if the user wrote to it; otherwise we use gl_Position.
2327     */
2328    gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2329    if (!(c->prog_data.outputs_written
2330          & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2331       clip_vertex = VERT_RESULT_HPOS;
2332    }
2333
2334    for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2335         ++i) {
2336       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2337                src_reg(output_reg[clip_vertex]),
2338                src_reg(this->userplane[i + offset])));
2339    }
2340 }
2341
2342 void
2343 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2344 {
2345    assert (vert_result < VERT_RESULT_MAX);
2346    reg.type = output_reg[vert_result].type;
2347    current_annotation = output_reg_annotation[vert_result];
2348    /* Copy the register, saturating if necessary */
2349    vec4_instruction *inst = emit(MOV(reg,
2350                                      src_reg(output_reg[vert_result])));
2351    if ((vert_result == VERT_RESULT_COL0 ||
2352         vert_result == VERT_RESULT_COL1 ||
2353         vert_result == VERT_RESULT_BFC0 ||
2354         vert_result == VERT_RESULT_BFC1) &&
2355        c->key.clamp_vertex_color) {
2356       inst->saturate = true;
2357    }
2358 }
2359
2360 void
2361 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2362 {
2363    struct brw_reg hw_reg = brw_message_reg(mrf);
2364    dst_reg reg = dst_reg(MRF, mrf);
2365    reg.type = BRW_REGISTER_TYPE_F;
2366
2367    switch (vert_result) {
2368    case VERT_RESULT_PSIZ:
2369       /* PSIZ is always in slot 0, and is coupled with other flags. */
2370       current_annotation = "indices, point width, clip flags";
2371       emit_psiz_and_flags(hw_reg);
2372       break;
2373    case BRW_VERT_RESULT_NDC:
2374       current_annotation = "NDC";
2375       emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2376       break;
2377    case BRW_VERT_RESULT_HPOS_DUPLICATE:
2378    case VERT_RESULT_HPOS:
2379       current_annotation = "gl_Position";
2380       emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2381       break;
2382    case VERT_RESULT_CLIP_DIST0:
2383    case VERT_RESULT_CLIP_DIST1:
2384       if (this->c->key.uses_clip_distance) {
2385          emit_generic_urb_slot(reg, vert_result);
2386       } else {
2387          current_annotation = "user clip distances";
2388          emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2389       }
2390       break;
2391    case VERT_RESULT_EDGE:
2392       /* This is present when doing unfilled polygons.  We're supposed to copy
2393        * the edge flag from the user-provided vertex array
2394        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2395        * of that attribute (starts as 1.0f).  This is then used in clipping to
2396        * determine which edges should be drawn as wireframe.
2397        */
2398       current_annotation = "edge flag";
2399       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2400                                     glsl_type::float_type, WRITEMASK_XYZW))));
2401       break;
2402    case BRW_VERT_RESULT_PAD:
2403       /* No need to write to this slot */
2404       break;
2405    default:
2406       emit_generic_urb_slot(reg, vert_result);
2407       break;
2408    }
2409 }
2410
2411 static int
2412 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2413 {
2414    struct intel_context *intel = &brw->intel;
2415
2416    if (intel->gen >= 6) {
2417       /* URB data written (does not include the message header reg) must
2418        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
2419        * section 5.4.3.2.2: URB_INTERLEAVED.
2420        *
2421        * URB entries are allocated on a multiple of 1024 bits, so an
2422        * extra 128 bits written here to make the end align to 256 is
2423        * no problem.
2424        */
2425       if ((mlen % 2) != 1)
2426          mlen++;
2427    }
2428
2429    return mlen;
2430 }
2431
2432 /**
2433  * Generates the VUE payload plus the 1 or 2 URB write instructions to
2434  * complete the VS thread.
2435  *
2436  * The VUE layout is documented in Volume 2a.
2437  */
2438 void
2439 vec4_visitor::emit_urb_writes()
2440 {
2441    /* MRF 0 is reserved for the debugger, so start with message header
2442     * in MRF 1.
2443     */
2444    int base_mrf = 1;
2445    int mrf = base_mrf;
2446    /* In the process of generating our URB write message contents, we
2447     * may need to unspill a register or load from an array.  Those
2448     * reads would use MRFs 14-15.
2449     */
2450    int max_usable_mrf = 13;
2451
2452    /* The following assertion verifies that max_usable_mrf causes an
2453     * even-numbered amount of URB write data, which will meet gen6's
2454     * requirements for length alignment.
2455     */
2456    assert ((max_usable_mrf - base_mrf) % 2 == 0);
2457
2458    /* First mrf is the g0-based message header containing URB handles and such,
2459     * which is implied in VS_OPCODE_URB_WRITE.
2460     */
2461    mrf++;
2462
2463    if (intel->gen < 6) {
2464       emit_ndc_computation();
2465    }
2466
2467    /* Set up the VUE data for the first URB write */
2468    int slot;
2469    for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2470       emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2471
2472       /* If this was max_usable_mrf, we can't fit anything more into this URB
2473        * WRITE.
2474        */
2475       if (mrf > max_usable_mrf) {
2476          slot++;
2477          break;
2478       }
2479    }
2480
2481    current_annotation = "URB write";
2482    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2483    inst->base_mrf = base_mrf;
2484    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2485    inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2486
2487    /* Optional second URB write */
2488    if (!inst->eot) {
2489       mrf = base_mrf + 1;
2490
2491       for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2492          assert(mrf < max_usable_mrf);
2493
2494          emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2495       }
2496
2497       current_annotation = "URB write";
2498       inst = emit(VS_OPCODE_URB_WRITE);
2499       inst->base_mrf = base_mrf;
2500       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2501       inst->eot = true;
2502       /* URB destination offset.  In the previous write, we got MRFs
2503        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
2504        * URB row increments, and each of our MRFs is half of one of
2505        * those, since we're doing interleaved writes.
2506        */
2507       inst->offset = (max_usable_mrf - base_mrf) / 2;
2508    }
2509 }
2510
2511 src_reg
2512 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2513                                  src_reg *reladdr, int reg_offset)
2514 {
2515    /* Because we store the values to scratch interleaved like our
2516     * vertex data, we need to scale the vec4 index by 2.
2517     */
2518    int message_header_scale = 2;
2519
2520    /* Pre-gen6, the message header uses byte offsets instead of vec4
2521     * (16-byte) offset units.
2522     */
2523    if (intel->gen < 6)
2524       message_header_scale *= 16;
2525
2526    if (reladdr) {
2527       src_reg index = src_reg(this, glsl_type::int_type);
2528
2529       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2530       emit_before(inst, MUL(dst_reg(index),
2531                             index, src_reg(message_header_scale)));
2532
2533       return index;
2534    } else {
2535       return src_reg(reg_offset * message_header_scale);
2536    }
2537 }
2538
2539 src_reg
2540 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2541                                        src_reg *reladdr, int reg_offset)
2542 {
2543    if (reladdr) {
2544       src_reg index = src_reg(this, glsl_type::int_type);
2545
2546       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2547
2548       /* Pre-gen6, the message header uses byte offsets instead of vec4
2549        * (16-byte) offset units.
2550        */
2551       if (intel->gen < 6) {
2552          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2553       }
2554
2555       return index;
2556    } else {
2557       int message_header_scale = intel->gen < 6 ? 16 : 1;
2558       return src_reg(reg_offset * message_header_scale);
2559    }
2560 }
2561
2562 /**
2563  * Emits an instruction before @inst to load the value named by @orig_src
2564  * from scratch space at @base_offset to @temp.
2565  *
2566  * @base_offset is measured in 32-byte units (the size of a register).
2567  */
2568 void
2569 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2570                                 dst_reg temp, src_reg orig_src,
2571                                 int base_offset)
2572 {
2573    int reg_offset = base_offset + orig_src.reg_offset;
2574    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2575
2576    emit_before(inst, SCRATCH_READ(temp, index));
2577 }
2578
2579 /**
2580  * Emits an instruction after @inst to store the value to be written
2581  * to @orig_dst to scratch space at @base_offset, from @temp.
2582  *
2583  * @base_offset is measured in 32-byte units (the size of a register).
2584  */
2585 void
2586 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2587 {
2588    int reg_offset = base_offset + inst->dst.reg_offset;
2589    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2590
2591    /* Create a temporary register to store *inst's result in.
2592     *
2593     * We have to be careful in MOVing from our temporary result register in
2594     * the scratch write.  If we swizzle from channels of the temporary that
2595     * weren't initialized, it will confuse live interval analysis, which will
2596     * make spilling fail to make progress.
2597     */
2598    src_reg temp = src_reg(this, glsl_type::vec4_type);
2599    temp.type = inst->dst.type;
2600    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2601    int swizzles[4];
2602    for (int i = 0; i < 4; i++)
2603       if (inst->dst.writemask & (1 << i))
2604          swizzles[i] = i;
2605       else
2606          swizzles[i] = first_writemask_chan;
2607    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2608                                swizzles[2], swizzles[3]);
2609
2610    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2611                                        inst->dst.writemask));
2612    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2613    write->predicate = inst->predicate;
2614    write->ir = inst->ir;
2615    write->annotation = inst->annotation;
2616    inst->insert_after(write);
2617
2618    inst->dst.file = temp.file;
2619    inst->dst.reg = temp.reg;
2620    inst->dst.reg_offset = temp.reg_offset;
2621    inst->dst.reladdr = NULL;
2622 }
2623
2624 /**
2625  * We can't generally support array access in GRF space, because a
2626  * single instruction's destination can only span 2 contiguous
2627  * registers.  So, we send all GRF arrays that get variable index
2628  * access to scratch space.
2629  */
2630 void
2631 vec4_visitor::move_grf_array_access_to_scratch()
2632 {
2633    int scratch_loc[this->virtual_grf_count];
2634
2635    for (int i = 0; i < this->virtual_grf_count; i++) {
2636       scratch_loc[i] = -1;
2637    }
2638
2639    /* First, calculate the set of virtual GRFs that need to be punted
2640     * to scratch due to having any array access on them, and where in
2641     * scratch.
2642     */
2643    foreach_list(node, &this->instructions) {
2644       vec4_instruction *inst = (vec4_instruction *)node;
2645
2646       if (inst->dst.file == GRF && inst->dst.reladdr &&
2647           scratch_loc[inst->dst.reg] == -1) {
2648          scratch_loc[inst->dst.reg] = c->last_scratch;
2649          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2650       }
2651
2652       for (int i = 0 ; i < 3; i++) {
2653          src_reg *src = &inst->src[i];
2654
2655          if (src->file == GRF && src->reladdr &&
2656              scratch_loc[src->reg] == -1) {
2657             scratch_loc[src->reg] = c->last_scratch;
2658             c->last_scratch += this->virtual_grf_sizes[src->reg];
2659          }
2660       }
2661    }
2662
2663    /* Now, for anything that will be accessed through scratch, rewrite
2664     * it to load/store.  Note that this is a _safe list walk, because
2665     * we may generate a new scratch_write instruction after the one
2666     * we're processing.
2667     */
2668    foreach_list_safe(node, &this->instructions) {
2669       vec4_instruction *inst = (vec4_instruction *)node;
2670
2671       /* Set up the annotation tracking for new generated instructions. */
2672       base_ir = inst->ir;
2673       current_annotation = inst->annotation;
2674
2675       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2676          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2677       }
2678
2679       for (int i = 0 ; i < 3; i++) {
2680          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2681             continue;
2682
2683          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2684
2685          emit_scratch_read(inst, temp, inst->src[i],
2686                            scratch_loc[inst->src[i].reg]);
2687
2688          inst->src[i].file = temp.file;
2689          inst->src[i].reg = temp.reg;
2690          inst->src[i].reg_offset = temp.reg_offset;
2691          inst->src[i].reladdr = NULL;
2692       }
2693    }
2694 }
2695
2696 /**
2697  * Emits an instruction before @inst to load the value named by @orig_src
2698  * from the pull constant buffer (surface) at @base_offset to @temp.
2699  */
2700 void
2701 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2702                                       dst_reg temp, src_reg orig_src,
2703                                       int base_offset)
2704 {
2705    int reg_offset = base_offset + orig_src.reg_offset;
2706    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2707    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2708    vec4_instruction *load;
2709
2710    load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2711                                         temp, index, offset);
2712    load->base_mrf = 14;
2713    load->mlen = 1;
2714    emit_before(inst, load);
2715 }
2716
2717 /**
2718  * Implements array access of uniforms by inserting a
2719  * PULL_CONSTANT_LOAD instruction.
2720  *
2721  * Unlike temporary GRF array access (where we don't support it due to
2722  * the difficulty of doing relative addressing on instruction
2723  * destinations), we could potentially do array access of uniforms
2724  * that were loaded in GRF space as push constants.  In real-world
2725  * usage we've seen, though, the arrays being used are always larger
2726  * than we could load as push constants, so just always move all
2727  * uniform array access out to a pull constant buffer.
2728  */
2729 void
2730 vec4_visitor::move_uniform_array_access_to_pull_constants()
2731 {
2732    int pull_constant_loc[this->uniforms];
2733
2734    for (int i = 0; i < this->uniforms; i++) {
2735       pull_constant_loc[i] = -1;
2736    }
2737
2738    /* Walk through and find array access of uniforms.  Put a copy of that
2739     * uniform in the pull constant buffer.
2740     *
2741     * Note that we don't move constant-indexed accesses to arrays.  No
2742     * testing has been done of the performance impact of this choice.
2743     */
2744    foreach_list_safe(node, &this->instructions) {
2745       vec4_instruction *inst = (vec4_instruction *)node;
2746
2747       for (int i = 0 ; i < 3; i++) {
2748          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2749             continue;
2750
2751          int uniform = inst->src[i].reg;
2752
2753          /* If this array isn't already present in the pull constant buffer,
2754           * add it.
2755           */
2756          if (pull_constant_loc[uniform] == -1) {
2757             const float **values = &prog_data->param[uniform * 4];
2758
2759             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2760
2761             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2762                prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2763             }
2764          }
2765
2766          /* Set up the annotation tracking for new generated instructions. */
2767          base_ir = inst->ir;
2768          current_annotation = inst->annotation;
2769
2770          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2771
2772          emit_pull_constant_load(inst, temp, inst->src[i],
2773                                  pull_constant_loc[uniform]);
2774
2775          inst->src[i].file = temp.file;
2776          inst->src[i].reg = temp.reg;
2777          inst->src[i].reg_offset = temp.reg_offset;
2778          inst->src[i].reladdr = NULL;
2779       }
2780    }
2781
2782    /* Now there are no accesses of the UNIFORM file with a reladdr, so
2783     * no need to track them as larger-than-vec4 objects.  This will be
2784     * relied on in cutting out unused uniform vectors from push
2785     * constants.
2786     */
2787    split_uniform_registers();
2788 }
2789
2790 void
2791 vec4_visitor::resolve_ud_negate(src_reg *reg)
2792 {
2793    if (reg->type != BRW_REGISTER_TYPE_UD ||
2794        !reg->negate)
2795       return;
2796
2797    src_reg temp = src_reg(this, glsl_type::uvec4_type);
2798    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2799    *reg = temp;
2800 }
2801
2802 vec4_visitor::vec4_visitor(struct brw_context *brw,
2803                            struct brw_vs_compile *c,
2804                            struct gl_shader_program *prog,
2805                            struct brw_shader *shader,
2806                            void *mem_ctx)
2807 {
2808    this->c = c;
2809    this->brw = brw;
2810    this->intel = &brw->intel;
2811    this->ctx = &intel->ctx;
2812    this->prog = prog;
2813    this->shader = shader;
2814
2815    this->mem_ctx = mem_ctx;
2816    this->failed = false;
2817
2818    this->base_ir = NULL;
2819    this->current_annotation = NULL;
2820    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2821
2822    this->c = c;
2823    this->vp = &c->vp->program;
2824    this->prog_data = &c->prog_data;
2825
2826    this->variable_ht = hash_table_ctor(0,
2827                                        hash_table_pointer_hash,
2828                                        hash_table_pointer_compare);
2829
2830    this->virtual_grf_def = NULL;
2831    this->virtual_grf_use = NULL;
2832    this->virtual_grf_sizes = NULL;
2833    this->virtual_grf_count = 0;
2834    this->virtual_grf_reg_map = NULL;
2835    this->virtual_grf_reg_count = 0;
2836    this->virtual_grf_array_size = 0;
2837    this->live_intervals_valid = false;
2838
2839    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2840
2841    this->uniforms = 0;
2842 }
2843
2844 vec4_visitor::~vec4_visitor()
2845 {
2846    hash_table_dtor(this->variable_ht);
2847 }
2848
2849
2850 void
2851 vec4_visitor::fail(const char *format, ...)
2852 {
2853    va_list va;
2854    char *msg;
2855
2856    if (failed)
2857       return;
2858
2859    failed = true;
2860
2861    va_start(va, format);
2862    msg = ralloc_vasprintf(mem_ctx, format, va);
2863    va_end(va);
2864    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2865
2866    this->fail_msg = msg;
2867
2868    if (INTEL_DEBUG & DEBUG_VS) {
2869       fprintf(stderr, "%s",  msg);
2870    }
2871 }
2872
2873 } /* namespace brw */