i965/skl: Add a message header for the TXF_MCS instruction in vec4vs
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
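/* As a concrete example of the layout built below (illustrative, not from
 * the original source): packing vec2(1.0, 2.0) stores half(1.0) = 0x3c00
 * from the X channel in the low word and half(2.0) = 0x4000 from the Y
 * channel in the high word, so each enabled channel of dst ends up holding
 * 0x40003c00.
 */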
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
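/* Concretely (illustrative example): for a packed input of 0x40003c00, the
 * AND below produces 0x3c00 in tmp.x, the SHR produces 0x4000 in tmp.y,
 * and F16TO32 then yields dst.xy = (1.0, 2.0) -- the inverse of the pack
 * case above.
 */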
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
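/* 0x00, 0x60, 0x70 and 0x78 are the packed vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the UD register
 * below turns them into the shift counts <0, 8, 16, 24>.
 */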
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
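/* For example: float, vec2 and vec4 all take one vec4 slot here, mat4
 * takes 4 via the matrix case above, and vec3[10] takes 10 via the array
 * case below.
 */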
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been set up by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (devinfo->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (devinfo->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (devinfo->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (devinfo->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (devinfo->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (devinfo->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (devinfo->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (devinfo->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (devinfo->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (devinfo->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (devinfo->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (devinfo->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301 * and before_inst can be NULL, in which case the instruction will be appended
1302 * to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (devinfo->gen >= 9) {
1317 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1318 src_reg header(this, glsl_type::uvec4_type, 2);
1319
1320 pull = new(mem_ctx)
1321 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1322 dst_reg(header));
1323
1324 if (before_inst)
1325 emit_before(before_block, before_inst, pull);
1326 else
1327 emit(pull);
1328
1329 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1330 offset_reg.type);
1331 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1332
1333 if (before_inst)
1334 emit_before(before_block, before_inst, pull);
1335 else
1336 emit(pull);
1337
1338 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1339 dst,
1340 surf_index,
1341 header);
1342 pull->mlen = 2;
1343 pull->header_size = 1;
1344 } else if (devinfo->gen >= 7) {
1345 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1346
1347 grf_offset.type = offset_reg.type;
1348
1349 pull = MOV(grf_offset, offset_reg);
1350
1351 if (before_inst)
1352 emit_before(before_block, before_inst, pull);
1353 else
1354 emit(pull);
1355
1356 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1357 dst,
1358 surf_index,
1359 src_reg(grf_offset));
1360 pull->mlen = 1;
1361 } else {
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1363 dst,
1364 surf_index,
1365 offset_reg);
1366 pull->base_mrf = 14;
1367 pull->mlen = 1;
1368 }
1369
1370 if (before_inst)
1371 emit_before(before_block, before_inst, pull);
1372 else
1373 emit(pull);
1374 }
1375
1376 void
1377 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1378 {
1379 const src_reg chan_index(this, glsl_type::uint_type);
1380
1381 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1382 ->force_writemask_all = true;
1383 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1384 ->force_writemask_all = true;
1385 }
1386
1387 void
1388 vec4_visitor::visit(ir_expression *ir)
1389 {
1390 unsigned int operand;
1391 src_reg op[ARRAY_SIZE(ir->operands)];
1392 vec4_instruction *inst;
1393
1394 if (ir->operation == ir_binop_add) {
1395 if (try_emit_mad(ir))
1396 return;
1397 }
1398
1399 if (ir->operation == ir_unop_b2f) {
1400 if (try_emit_b2f_of_compare(ir))
1401 return;
1402 }
1403
1404 /* Storage for our result. Ideally for an assignment we'd be using
1405 * the actual storage for the result here, instead.
1406 */
1407 dst_reg result_dst(this, ir->type);
1408 src_reg result_src(result_dst);
1409
1410 if (ir->operation == ir_triop_csel) {
1411 ir->operands[1]->accept(this);
1412 op[1] = this->result;
1413 ir->operands[2]->accept(this);
1414 op[2] = this->result;
1415
1416 enum brw_predicate predicate;
1417 emit_bool_to_cond_code(ir->operands[0], &predicate);
1418 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1419 inst->predicate = predicate;
1420 this->result = result_src;
1421 return;
1422 }
1423
1424 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1425 this->result.file = BAD_FILE;
1426 ir->operands[operand]->accept(this);
1427 if (this->result.file == BAD_FILE) {
1428 fprintf(stderr, "Failed to get tree for expression operand:\n");
1429 ir->operands[operand]->fprint(stderr);
1430 exit(1);
1431 }
1432 op[operand] = this->result;
1433
1434 /* Matrix expression operands should have been broken down to vector
1435 * operations already.
1436 */
1437 assert(!ir->operands[operand]->type->is_matrix());
1438 }
1439
1440 /* If nothing special happens, this is the result. */
1441 this->result = result_src;
1442
1443 switch (ir->operation) {
1444 case ir_unop_logic_not:
1445 emit(NOT(result_dst, op[0]));
1446 break;
1447 case ir_unop_neg:
1448 op[0].negate = !op[0].negate;
1449 emit(MOV(result_dst, op[0]));
1450 break;
1451 case ir_unop_abs:
1452 op[0].abs = true;
1453 op[0].negate = false;
1454 emit(MOV(result_dst, op[0]));
1455 break;
1456
1457 case ir_unop_sign:
1458 if (ir->type->is_float()) {
1459 /* AND(val, 0x80000000) gives the sign bit.
1460 *
1461 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1462 * zero.
1463 */
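/* e.g. sign(-3.5) keeps sign bit 0x80000000 and ORs in 0x3f800000, giving
 * 0xbf800000 (-1.0f); sign(2.0) gives 0x3f800000 (1.0f); for 0.0 the
 * predicate is false and the result stays 0.
 */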
1464 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1465
1466 op[0].type = BRW_REGISTER_TYPE_UD;
1467 result_dst.type = BRW_REGISTER_TYPE_UD;
1468 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1469
1470 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1471 inst->predicate = BRW_PREDICATE_NORMAL;
1472
1473 this->result.type = BRW_REGISTER_TYPE_F;
1474 } else {
1475 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1476 * -> non-negative val generates 0x00000000.
1477 * Predicated OR sets 1 if val is positive.
1478 */
1479 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1480
1481 emit(ASR(result_dst, op[0], src_reg(31)));
1482
1483 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1484 inst->predicate = BRW_PREDICATE_NORMAL;
1485 }
1486 break;
1487
1488 case ir_unop_rcp:
1489 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1490 break;
1491
1492 case ir_unop_exp2:
1493 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1494 break;
1495 case ir_unop_log2:
1496 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1497 break;
1498 case ir_unop_exp:
1499 case ir_unop_log:
1500 unreachable("not reached: should be handled by ir_explog_to_explog2");
1501 case ir_unop_sin:
1502 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1503 break;
1504 case ir_unop_cos:
1505 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1506 break;
1507
1508 case ir_unop_dFdx:
1509 case ir_unop_dFdx_coarse:
1510 case ir_unop_dFdx_fine:
1511 case ir_unop_dFdy:
1512 case ir_unop_dFdy_coarse:
1513 case ir_unop_dFdy_fine:
1514 unreachable("derivatives not valid in vertex shader");
1515
1516 case ir_unop_bitfield_reverse:
1517 emit(BFREV(result_dst, op[0]));
1518 break;
1519 case ir_unop_bit_count:
1520 emit(CBIT(result_dst, op[0]));
1521 break;
1522 case ir_unop_find_msb: {
1523 src_reg temp = src_reg(this, glsl_type::uint_type);
1524
1525 inst = emit(FBH(dst_reg(temp), op[0]));
1526 inst->dst.writemask = WRITEMASK_XYZW;
1527
1528 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1529 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1530 * subtract the result from 31 to convert the MSB count into an LSB count.
1531 */
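/* e.g. for an input of 0x00000100, FBH returns 23 (counted from the MSB
 * side) and the predicated 31 - 23 below yields findMSB()'s answer of 8;
 * an FBH result of 0xFFFFFFFF (-1, no bits found) is left untouched.
 */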
1532
1533 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1534 temp.swizzle = BRW_SWIZZLE_NOOP;
1535 emit(MOV(result_dst, temp));
1536
1537 src_reg src_tmp = src_reg(result_dst);
1538 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1539
1540 src_tmp.negate = true;
1541 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1542 inst->predicate = BRW_PREDICATE_NORMAL;
1543 break;
1544 }
1545 case ir_unop_find_lsb:
1546 emit(FBL(result_dst, op[0]));
1547 break;
1548 case ir_unop_saturate:
1549 inst = emit(MOV(result_dst, op[0]));
1550 inst->saturate = true;
1551 break;
1552
1553 case ir_unop_noise:
1554 unreachable("not reached: should be handled by lower_noise");
1555
1556 case ir_binop_add:
1557 emit(ADD(result_dst, op[0], op[1]));
1558 break;
1559 case ir_binop_sub:
1560 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1561
1562 case ir_binop_mul:
1563 if (devinfo->gen < 8 && ir->type->is_integer()) {
1564 /* For integer multiplication, the MUL uses the low 16 bits of one of
1565 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1566 * then adds in the contribution of the upper 16 bits of that
1567 * operand. If we can determine that one of the args is in the low
1568 * 16 bits, though, we can just emit a single MUL.
1569 */
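/* Note (illustrative): in the general non-constant case below, MACH leaves
 * the low 32 bits of the full 32x32 product in the accumulator, so the
 * result is read back from acc with a MOV; MACH's own destination receives
 * the high 32 bits, which is what ir_binop_imul_high uses instead.
 */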
1570 if (ir->operands[0]->is_uint16_constant()) {
1571 if (devinfo->gen < 7)
1572 emit(MUL(result_dst, op[0], op[1]));
1573 else
1574 emit(MUL(result_dst, op[1], op[0]));
1575 } else if (ir->operands[1]->is_uint16_constant()) {
1576 if (devinfo->gen < 7)
1577 emit(MUL(result_dst, op[1], op[0]));
1578 else
1579 emit(MUL(result_dst, op[0], op[1]));
1580 } else {
1581 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1582
1583 emit(MUL(acc, op[0], op[1]));
1584 emit(MACH(dst_null_d(), op[0], op[1]));
1585 emit(MOV(result_dst, src_reg(acc)));
1586 }
1587 } else {
1588 emit(MUL(result_dst, op[0], op[1]));
1589 }
1590 break;
1591 case ir_binop_imul_high: {
1592 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1593
1594 emit(MUL(acc, op[0], op[1]));
1595 emit(MACH(result_dst, op[0], op[1]));
1596 break;
1597 }
1598 case ir_binop_div:
1599 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1600 assert(ir->type->is_integer());
1601 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1602 break;
1603 case ir_binop_carry: {
1604 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1605
1606 emit(ADDC(dst_null_ud(), op[0], op[1]));
1607 emit(MOV(result_dst, src_reg(acc)));
1608 break;
1609 }
1610 case ir_binop_borrow: {
1611 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1612
1613 emit(SUBB(dst_null_ud(), op[0], op[1]));
1614 emit(MOV(result_dst, src_reg(acc)));
1615 break;
1616 }
1617 case ir_binop_mod:
1618 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1619 assert(ir->type->is_integer());
1620 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1621 break;
1622
1623 case ir_binop_less:
1624 case ir_binop_greater:
1625 case ir_binop_lequal:
1626 case ir_binop_gequal:
1627 case ir_binop_equal:
1628 case ir_binop_nequal: {
1629 if (devinfo->gen <= 5) {
1630 resolve_bool_comparison(ir->operands[0], &op[0]);
1631 resolve_bool_comparison(ir->operands[1], &op[1]);
1632 }
1633 emit(CMP(result_dst, op[0], op[1],
1634 brw_conditional_for_comparison(ir->operation)));
1635 break;
1636 }
1637
1638 case ir_binop_all_equal:
1639 if (devinfo->gen <= 5) {
1640 resolve_bool_comparison(ir->operands[0], &op[0]);
1641 resolve_bool_comparison(ir->operands[1], &op[1]);
1642 }
1643
1644 /* "==" operator producing a scalar boolean. */
1645 if (ir->operands[0]->type->is_vector() ||
1646 ir->operands[1]->type->is_vector()) {
1647 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1648 emit(MOV(result_dst, src_reg(0)));
1649 inst = emit(MOV(result_dst, src_reg(~0)));
1650 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1651 } else {
1652 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1653 }
1654 break;
1655 case ir_binop_any_nequal:
1656 if (devinfo->gen <= 5) {
1657 resolve_bool_comparison(ir->operands[0], &op[0]);
1658 resolve_bool_comparison(ir->operands[1], &op[1]);
1659 }
1660
1661 /* "!=" operator producing a scalar boolean. */
1662 if (ir->operands[0]->type->is_vector() ||
1663 ir->operands[1]->type->is_vector()) {
1664 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1665
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1671 }
1672 break;
1673
1674 case ir_unop_any:
1675 if (devinfo->gen <= 5) {
1676 resolve_bool_comparison(ir->operands[0], &op[0]);
1677 }
1678 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1679 emit(MOV(result_dst, src_reg(0)));
1680
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1683 break;
1684
1685 case ir_binop_logic_xor:
1686 emit(XOR(result_dst, op[0], op[1]));
1687 break;
1688
1689 case ir_binop_logic_or:
1690 emit(OR(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_logic_and:
1694 emit(AND(result_dst, op[0], op[1]));
1695 break;
1696
1697 case ir_binop_dot:
1698 assert(ir->operands[0]->type->is_vector());
1699 assert(ir->operands[0]->type == ir->operands[1]->type);
1700 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1701 break;
1702
1703 case ir_unop_sqrt:
1704 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1705 break;
1706 case ir_unop_rsq:
1707 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1708 break;
1709
1710 case ir_unop_bitcast_i2f:
1711 case ir_unop_bitcast_u2f:
1712 this->result = op[0];
1713 this->result.type = BRW_REGISTER_TYPE_F;
1714 break;
1715
1716 case ir_unop_bitcast_f2i:
1717 this->result = op[0];
1718 this->result.type = BRW_REGISTER_TYPE_D;
1719 break;
1720
1721 case ir_unop_bitcast_f2u:
1722 this->result = op[0];
1723 this->result.type = BRW_REGISTER_TYPE_UD;
1724 break;
1725
1726 case ir_unop_i2f:
1727 case ir_unop_i2u:
1728 case ir_unop_u2i:
1729 case ir_unop_u2f:
1730 case ir_unop_f2i:
1731 case ir_unop_f2u:
1732 emit(MOV(result_dst, op[0]));
1733 break;
1734 case ir_unop_b2i:
1735 emit(AND(result_dst, op[0], src_reg(1)));
1736 break;
1737 case ir_unop_b2f:
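/* Booleans reaching this point are 0 or ~0 (gen <= 5 needs the
 * resolve_bool_comparison below first), so ANDing with 0x3f800000, the bit
 * pattern of 1.0f, yields 0.0f or 1.0f directly.
 */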
1738 if (devinfo->gen <= 5) {
1739 resolve_bool_comparison(ir->operands[0], &op[0]);
1740 }
1741 op[0].type = BRW_REGISTER_TYPE_D;
1742 result_dst.type = BRW_REGISTER_TYPE_D;
1743 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1744 result_dst.type = BRW_REGISTER_TYPE_F;
1745 break;
1746 case ir_unop_f2b:
1747 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1748 break;
1749 case ir_unop_i2b:
1750 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1751 break;
1752
1753 case ir_unop_trunc:
1754 emit(RNDZ(result_dst, op[0]));
1755 break;
1756 case ir_unop_ceil: {
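/* ceil(x) is computed as -RNDD(-x), i.e. -floor(-x). */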
1757 src_reg tmp = src_reg(this, ir->type);
1758 op[0].negate = !op[0].negate;
1759 emit(RNDD(dst_reg(tmp), op[0]));
1760 tmp.negate = true;
1761 emit(MOV(result_dst, tmp));
1762 }
1763 break;
1764 case ir_unop_floor:
1765 inst = emit(RNDD(result_dst, op[0]));
1766 break;
1767 case ir_unop_fract:
1768 inst = emit(FRC(result_dst, op[0]));
1769 break;
1770 case ir_unop_round_even:
1771 emit(RNDE(result_dst, op[0]));
1772 break;
1773
1774 case ir_binop_min:
1775 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1776 break;
1777 case ir_binop_max:
1778 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1779 break;
1780
1781 case ir_binop_pow:
1782 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1783 break;
1784
1785 case ir_unop_bit_not:
1786 inst = emit(NOT(result_dst, op[0]));
1787 break;
1788 case ir_binop_bit_and:
1789 inst = emit(AND(result_dst, op[0], op[1]));
1790 break;
1791 case ir_binop_bit_xor:
1792 inst = emit(XOR(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_or:
1795 inst = emit(OR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_lshift:
1799 inst = emit(SHL(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_rshift:
1803 if (ir->type->base_type == GLSL_TYPE_INT)
1804 inst = emit(ASR(result_dst, op[0], op[1]));
1805 else
1806 inst = emit(SHR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_bfm:
1810 emit(BFI1(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_ubo_load: {
1814 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1815 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1816 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1817 src_reg offset;
1818
1819 /* Now, load the vector from that offset. */
1820 assert(ir->type->is_vector() || ir->type->is_scalar());
1821
1822 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1823 packed_consts.type = result.type;
1824 src_reg surf_index;
1825
1826 if (const_uniform_block) {
1827 /* The block index is a constant, so just emit the binding table entry
1828 * as an immediate.
1829 */
1830 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1831 const_uniform_block->value.u[0]);
1832 } else {
1833 /* The block index is not a constant. Evaluate the index expression
1834 * per-channel and add the base UBO index; we have to select a value
1835 * from any live channel.
1836 */
1837 surf_index = src_reg(this, glsl_type::uint_type);
1838 emit(ADD(dst_reg(surf_index), op[0],
1839 src_reg(prog_data->base.binding_table.ubo_start)));
1840 emit_uniformize(dst_reg(surf_index), surf_index);
1841
1842 /* Assume this may touch any UBO. It would be nice to provide
1843 * a tighter bound, but the array information is already lowered away.
1844 */
1845 brw_mark_surface_used(&prog_data->base,
1846 prog_data->base.binding_table.ubo_start +
1847 shader_prog->NumUniformBlocks - 1);
1848 }
1849
1850 if (const_offset_ir) {
1851 if (devinfo->gen >= 8) {
1852 /* Store the offset in a GRF so we can send-from-GRF. */
1853 offset = src_reg(this, glsl_type::int_type);
1854 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1855 } else {
1856 /* Immediates are fine on older generations since they'll be moved
1857 * to a (potentially fake) MRF at the generator level.
1858 */
1859 offset = src_reg(const_offset / 16);
1860 }
1861 } else {
1862 offset = src_reg(this, glsl_type::uint_type);
1863 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1864 }
1865
1866 emit_pull_constant_load_reg(dst_reg(packed_consts),
1867 surf_index,
1868 offset,
1869 NULL, NULL /* before_block/inst */);
1870
1871 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1872 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1873 const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4);
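/* For example, a constant byte offset of 36 results in a pull load of the
 * vec4 at offset 36 / 16 = 2 above, with the swizzle shift of
 * (36 % 16) / 4 = 1 just applied selecting its .y component.
 */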
1876
1877 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1878 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1879 emit(CMP(result_dst, packed_consts, src_reg(0u),
1880 BRW_CONDITIONAL_NZ));
1881 } else {
1882 emit(MOV(result_dst, packed_consts));
1883 }
1884 break;
1885 }
1886
1887 case ir_binop_vector_extract:
1888 unreachable("should have been lowered by vec_index_to_cond_assign");
1889
1890 case ir_triop_fma:
1891 op[0] = fix_3src_operand(op[0]);
1892 op[1] = fix_3src_operand(op[1]);
1893 op[2] = fix_3src_operand(op[2]);
1894 /* Note that the instruction's argument order is reversed from GLSL
1895 * and the IR.
1896 */
1897 emit(MAD(result_dst, op[2], op[1], op[0]));
1898 break;
1899
1900 case ir_triop_lrp:
1901 emit_lrp(result_dst, op[0], op[1], op[2]);
1902 break;
1903
1904 case ir_triop_csel:
1905 unreachable("already handled above");
1906 break;
1907
1908 case ir_triop_bfi:
1909 op[0] = fix_3src_operand(op[0]);
1910 op[1] = fix_3src_operand(op[1]);
1911 op[2] = fix_3src_operand(op[2]);
1912 emit(BFI2(result_dst, op[0], op[1], op[2]));
1913 break;
1914
1915 case ir_triop_bitfield_extract:
1916 op[0] = fix_3src_operand(op[0]);
1917 op[1] = fix_3src_operand(op[1]);
1918 op[2] = fix_3src_operand(op[2]);
1919 /* Note that the instruction's argument order is reversed from GLSL
1920 * and the IR.
1921 */
1922 emit(BFE(result_dst, op[2], op[1], op[0]));
1923 break;
1924
1925 case ir_triop_vector_insert:
1926 unreachable("should have been lowered by lower_vector_insert");
1927
1928 case ir_quadop_bitfield_insert:
1929 unreachable("not reached: should be handled by "
1930 "bitfield_insert_to_bfm_bfi\n");
1931
1932 case ir_quadop_vector:
1933 unreachable("not reached: should be handled by lower_quadop_vector");
1934
1935 case ir_unop_pack_half_2x16:
1936 emit_pack_half_2x16(result_dst, op[0]);
1937 break;
1938 case ir_unop_unpack_half_2x16:
1939 emit_unpack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_unorm_4x8:
1942 emit_unpack_unorm_4x8(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_snorm_4x8:
1945 emit_unpack_snorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_pack_unorm_4x8:
1948 emit_pack_unorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_snorm_4x8:
1951 emit_pack_snorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_2x16:
1954 case ir_unop_pack_unorm_2x16:
1955 case ir_unop_unpack_snorm_2x16:
1956 case ir_unop_unpack_unorm_2x16:
1957 unreachable("not reached: should be handled by lower_packing_builtins");
1958 case ir_unop_unpack_half_2x16_split_x:
1959 case ir_unop_unpack_half_2x16_split_y:
1960 case ir_binop_pack_half_2x16_split:
1961 case ir_unop_interpolate_at_centroid:
1962 case ir_binop_interpolate_at_sample:
1963 case ir_binop_interpolate_at_offset:
1964 unreachable("not reached: should not occur in vertex shader");
1965 case ir_binop_ldexp:
1966 unreachable("not reached: should be handled by ldexp_to_arith()");
1967 case ir_unop_d2f:
1968 case ir_unop_f2d:
1969 case ir_unop_d2i:
1970 case ir_unop_i2d:
1971 case ir_unop_d2u:
1972 case ir_unop_u2d:
1973 case ir_unop_d2b:
1974 case ir_unop_pack_double_2x32:
1975 case ir_unop_unpack_double_2x32:
1976 case ir_unop_frexp_sig:
1977 case ir_unop_frexp_exp:
1978 unreachable("fp64 todo");
1979 }
1980 }
1981
1982
1983 void
1984 vec4_visitor::visit(ir_swizzle *ir)
1985 {
1986 /* Note that this is only swizzles in expressions, not those on the left
1987 * hand side of an assignment, which do write masking. See ir_assignment
1988 * for that.
1989 */
1990 const unsigned swz = brw_compose_swizzle(
1991 brw_swizzle_for_size(ir->type->vector_elements),
1992 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1993
1994 ir->val->accept(this);
1995 this->result = swizzle(this->result, swz);
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_variable *ir)
2000 {
2001 const struct glsl_type *type = ir->type;
2002 dst_reg *reg = variable_storage(ir->var);
2003
2004 if (!reg) {
2005 fail("Failed to find variable storage for %s\n", ir->var->name);
2006 this->result = src_reg(brw_null_reg());
2007 return;
2008 }
2009
2010 this->result = src_reg(*reg);
2011
2012 /* System values get their swizzle from the dst_reg writemask */
2013 if (ir->var->data.mode == ir_var_system_value)
2014 return;
2015
2016 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2017 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2018 }
2019
2020
2021 int
2022 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2023 {
2024 /* Under normal circumstances array elements are stored consecutively, so
2025 * the stride is equal to the size of the array element.
2026 */
2027 return type_size(ir->type);
2028 }
2029
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_array *ir)
2033 {
2034 ir_constant *constant_index;
2035 src_reg src;
2036 int array_stride = compute_array_stride(ir);
2037
2038 constant_index = ir->array_index->constant_expression_value();
2039
2040 ir->array->accept(this);
2041 src = this->result;
2042
2043 if (constant_index) {
2044 src.reg_offset += constant_index->value.i[0] * array_stride;
2045 } else {
2046 /* Variable index array dereference. The result is the register holding
2047 * the base of the array, plus a dynamically computed index (reladdr)
2048 * that offsets the register index at which the element lives.
2049 */
2050 ir->array_index->accept(this);
2051
2052 src_reg index_reg;
2053
2054 if (array_stride == 1) {
2055 index_reg = this->result;
2056 } else {
2057 index_reg = src_reg(this, glsl_type::int_type);
2058
2059 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2060 }
2061
2062 if (src.reladdr) {
2063 src_reg temp = src_reg(this, glsl_type::int_type);
2064
2065 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2066
2067 index_reg = temp;
2068 }
2069
2070 src.reladdr = ralloc(mem_ctx, src_reg);
2071 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2072 }
2073
2074 /* If the type is smaller than a vec4, replicate the last channel out. */
2075 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2076 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2077 else
2078 src.swizzle = BRW_SWIZZLE_NOOP;
2079 src.type = brw_type_for_base_type(ir->type);
2080
2081 this->result = src;
2082 }
2083
2084 void
2085 vec4_visitor::visit(ir_dereference_record *ir)
2086 {
2087 unsigned int i;
2088 const glsl_type *struct_type = ir->record->type;
2089 int offset = 0;
2090
2091 ir->record->accept(this);
2092
2093 for (i = 0; i < struct_type->length; i++) {
2094 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2095 break;
2096 offset += type_size(struct_type->fields.structure[i].type);
2097 }
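/* offset now counts whole vec4 registers up to the requested field; e.g.
 * for struct { vec4 a; vec3 b; float c; }, accessing "c" adds 2, since
 * each non-matrix field occupies a full vec4 register in this backend.
 */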
2098
2099 /* If the type is smaller than a vec4, replicate the last channel out. */
2100 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2101 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2102 else
2103 this->result.swizzle = BRW_SWIZZLE_NOOP;
2104 this->result.type = brw_type_for_base_type(ir->type);
2105
2106 this->result.reg_offset += offset;
2107 }
2108
2109 /**
2110 * We want to be careful in assignment setup to hit the actual storage
2111 * instead of potentially using a temporary like we might with the
2112 * ir_dereference handler.
2113 */
2114 static dst_reg
2115 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2116 {
2117 /* The LHS must be a dereference. If the LHS is a variable indexed array
2118 * access of a vector, it must be separated into a series of conditional moves
2119 * before reaching this point (see ir_vec_index_to_cond_assign).
2120 */
2121 assert(ir->as_dereference());
2122 ir_dereference_array *deref_array = ir->as_dereference_array();
2123 if (deref_array) {
2124 assert(!deref_array->array->type->is_vector());
2125 }
2126
2127 /* Use the rvalue deref handler for the most part. We'll ignore
2128 * swizzles in it and write swizzles using writemask, though.
2129 */
2130 ir->accept(v);
2131 return dst_reg(v->result);
2132 }
2133
2134 void
2135 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2136 const struct glsl_type *type,
2137 enum brw_predicate predicate)
2138 {
2139 if (type->base_type == GLSL_TYPE_STRUCT) {
2140 for (unsigned int i = 0; i < type->length; i++) {
2141 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2142 }
2143 return;
2144 }
2145
2146 if (type->is_array()) {
2147 for (unsigned int i = 0; i < type->length; i++) {
2148 emit_block_move(dst, src, type->fields.array, predicate);
2149 }
2150 return;
2151 }
2152
2153 if (type->is_matrix()) {
2154 const struct glsl_type *vec_type;
2155
2156 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2157 type->vector_elements, 1);
2158
2159 for (int i = 0; i < type->matrix_columns; i++) {
2160 emit_block_move(dst, src, vec_type, predicate);
2161 }
2162 return;
2163 }
2164
2165 assert(type->is_scalar() || type->is_vector());
2166
2167 dst->type = brw_type_for_base_type(type);
2168 src->type = dst->type;
2169
2170 dst->writemask = (1 << type->vector_elements) - 1;
2171
2172 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2173
2174 vec4_instruction *inst = emit(MOV(*dst, *src));
2175 inst->predicate = predicate;
2176
2177 dst->reg_offset++;
2178 src->reg_offset++;
2179 }
2180
2181
2182 /* If the RHS processing resulted in an instruction generating a
2183 * temporary value, and it would be easy to rewrite the instruction to
2184 * generate its result right into the LHS instead, do so. This ends
2185 * up reliably removing instructions where it can be tricky to do so
2186 * later without real UD chain information.
2187 */
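/* For example, if the RHS produced "ADD tmp, a, b", the ADD is rewritten
 * to write the assignment's destination directly, and the trailing MOV
 * that visit(ir_assignment) would otherwise emit is skipped.
 */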
2188 bool
2189 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2190 dst_reg dst,
2191 src_reg src,
2192 vec4_instruction *pre_rhs_inst,
2193 vec4_instruction *last_rhs_inst)
2194 {
2195 /* This could be supported, but it would take more smarts. */
2196 if (ir->condition)
2197 return false;
2198
2199 if (pre_rhs_inst == last_rhs_inst)
2200 return false; /* No instructions generated to work with. */
2201
2202 /* Make sure the last instruction generated our source reg. */
2203 if (src.file != GRF ||
2204 src.file != last_rhs_inst->dst.file ||
2205 src.reg != last_rhs_inst->dst.reg ||
2206 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2207 src.reladdr ||
2208 src.abs ||
2209 src.negate ||
2210 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2211 return false;
2212
2213 /* Check that the last instruction fully initialized the channels
2214 * we want to use, in the order we want to use them. We could
2215 * potentially reswizzle the operands of many instructions so that
2216 * we could handle out of order channels, but don't yet.
2217 */
2218
2219 for (unsigned i = 0; i < 4; i++) {
2220 if (dst.writemask & (1 << i)) {
2221 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2222 return false;
2223
2224 if (BRW_GET_SWZ(src.swizzle, i) != i)
2225 return false;
2226 }
2227 }
2228
2229 /* Success! Rewrite the instruction. */
2230 last_rhs_inst->dst.file = dst.file;
2231 last_rhs_inst->dst.reg = dst.reg;
2232 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2233 last_rhs_inst->dst.reladdr = dst.reladdr;
2234 last_rhs_inst->dst.writemask &= dst.writemask;
2235
2236 return true;
2237 }
2238
2239 void
2240 vec4_visitor::visit(ir_assignment *ir)
2241 {
2242 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2243 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2244
2245 if (!ir->lhs->type->is_scalar() &&
2246 !ir->lhs->type->is_vector()) {
2247 ir->rhs->accept(this);
2248 src_reg src = this->result;
2249
2250 if (ir->condition) {
2251 emit_bool_to_cond_code(ir->condition, &predicate);
2252 }
2253
2254 /* emit_block_move doesn't account for swizzles in the source register.
2255 * This should be ok, since the source register is a structure or an
2256 * array, and those can't be swizzled. But double-check to be sure.
2257 */
2258 assert(src.swizzle ==
2259 (ir->rhs->type->is_matrix()
2260 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2261 : BRW_SWIZZLE_NOOP));
2262
2263 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2264 return;
2265 }
2266
2267 /* Now we're down to just a scalar/vector with writemasks. */
2268 int i;
2269
2270 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2271 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2272
2273 ir->rhs->accept(this);
2274
2275 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2276
2277 int swizzles[4];
2278 int src_chan = 0;
2279
2280 assert(ir->lhs->type->is_vector() ||
2281 ir->lhs->type->is_scalar());
2282 dst.writemask = ir->write_mask;
2283
2284 /* Swizzle a small RHS vector into the channels being written.
2285 *
2286 * glsl ir treats write_mask as dictating how many channels are
2287 * present on the RHS, while in our instructions we need to make
2288 * those channels appear in the slots of the vec4 they're written to.
2289 */
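/* e.g. a write mask of .xz yields swizzles {0, 0, 1, 0}: RHS component 0
 * feeds channel x and component 1 feeds channel z; the remaining channels
 * are masked off anyway.
 */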
2290 for (int i = 0; i < 4; i++)
2291 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2292
2293 src_reg src = swizzle(this->result,
2294 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2295 swizzles[2], swizzles[3]));
2296
2297 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2298 return;
2299 }
2300
2301 if (ir->condition) {
2302 emit_bool_to_cond_code(ir->condition, &predicate);
2303 }
2304
2305 for (i = 0; i < type_size(ir->lhs->type); i++) {
2306 vec4_instruction *inst = emit(MOV(dst, src));
2307 inst->predicate = predicate;
2308
2309 dst.reg_offset++;
2310 src.reg_offset++;
2311 }
2312 }
2313
2314 void
2315 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2316 {
2317 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2318 foreach_in_list(ir_constant, field_value, &ir->components) {
2319 emit_constant_values(dst, field_value);
2320 }
2321 return;
2322 }
2323
2324 if (ir->type->is_array()) {
2325 for (unsigned int i = 0; i < ir->type->length; i++) {
2326 emit_constant_values(dst, ir->array_elements[i]);
2327 }
2328 return;
2329 }
2330
2331 if (ir->type->is_matrix()) {
2332 for (int i = 0; i < ir->type->matrix_columns; i++) {
2333 float *vec = &ir->value.f[i * ir->type->vector_elements];
2334
2335 for (int j = 0; j < ir->type->vector_elements; j++) {
2336 dst->writemask = 1 << j;
2337 dst->type = BRW_REGISTER_TYPE_F;
2338
2339 emit(MOV(*dst, src_reg(vec[j])));
2340 }
2341 dst->reg_offset++;
2342 }
2343 return;
2344 }
2345
2346 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2347
2348 for (int i = 0; i < ir->type->vector_elements; i++) {
2349 if (!(remaining_writemask & (1 << i)))
2350 continue;
2351
2352 dst->writemask = 1 << i;
2353 dst->type = brw_type_for_base_type(ir->type);
2354
2355 /* Find other components that match the one we're about to
2356 * write. Emits fewer instructions for things like vec4(0.5,
2357 * 1.5, 1.5, 1.5).
2358 */
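/* For that vec4(0.5, 1.5, 1.5, 1.5) example, this loop widens the
 * writemask so only two MOVs are emitted: one for .x and one for .yzw.
 */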
2359 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2360 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2361 if (ir->value.b[i] == ir->value.b[j])
2362 dst->writemask |= (1 << j);
2363 } else {
2364 /* u, i, and f storage all line up, so no need for a
2365 * switch case for comparing each type.
2366 */
2367 if (ir->value.u[i] == ir->value.u[j])
2368 dst->writemask |= (1 << j);
2369 }
2370 }
2371
2372 switch (ir->type->base_type) {
2373 case GLSL_TYPE_FLOAT:
2374 emit(MOV(*dst, src_reg(ir->value.f[i])));
2375 break;
2376 case GLSL_TYPE_INT:
2377 emit(MOV(*dst, src_reg(ir->value.i[i])));
2378 break;
2379 case GLSL_TYPE_UINT:
2380 emit(MOV(*dst, src_reg(ir->value.u[i])));
2381 break;
2382 case GLSL_TYPE_BOOL:
2383 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2384 break;
2385 default:
2386 unreachable("Non-float/uint/int/bool constant");
2387 }
2388
2389 remaining_writemask &= ~dst->writemask;
2390 }
2391 dst->reg_offset++;
2392 }
2393
2394 void
2395 vec4_visitor::visit(ir_constant *ir)
2396 {
2397 dst_reg dst = dst_reg(this, ir->type);
2398 this->result = src_reg(dst);
2399
2400 emit_constant_values(&dst, ir);
2401 }
2402
2403 void
2404 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2405 {
2406 ir_dereference *deref = static_cast<ir_dereference *>(
2407 ir->actual_parameters.get_head());
2408 ir_variable *location = deref->variable_referenced();
2409 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2410 location->data.binding);
2411
2412 /* Calculate the surface offset */
2413 src_reg offset(this, glsl_type::uint_type);
2414 ir_dereference_array *deref_array = deref->as_dereference_array();
2415 if (deref_array) {
2416 deref_array->array_index->accept(this);
2417
2418 src_reg tmp(this, glsl_type::uint_type);
2419 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2420 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2421 } else {
2422 offset = location->data.atomic.offset;
2423 }
2424
2425 /* Emit the appropriate machine instruction */
2426 const char *callee = ir->callee->function_name();
2427 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2428
2429 if (!strcmp("__intrinsic_atomic_read", callee)) {
2430 emit_untyped_surface_read(surf_index, dst, offset);
2431
2432 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2433 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2434 src_reg(), src_reg());
2435
2436 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2438 src_reg(), src_reg());
2439 }
2440 }
2441
2442 void
2443 vec4_visitor::visit(ir_call *ir)
2444 {
2445 const char *callee = ir->callee->function_name();
2446
2447 if (!strcmp("__intrinsic_atomic_read", callee) ||
2448 !strcmp("__intrinsic_atomic_increment", callee) ||
2449 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 visit_atomic_counter_intrinsic(ir);
2451 } else {
2452 unreachable("Unsupported intrinsic.");
2453 }
2454 }
2455
2456 src_reg
2457 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2458 {
2459 vec4_instruction *inst =
2460 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2461 dst_reg(this, glsl_type::uvec4_type));
2462 inst->base_mrf = 2;
2463 inst->src[1] = sampler;
2464
2465 int param_base;
2466
2467 if (devinfo->gen >= 9) {
2468 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2469 vec4_instruction *header_inst = new(mem_ctx)
2470 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2471 dst_reg(MRF, inst->base_mrf));
2472
2473 emit(header_inst);
2474
2475 inst->mlen = 2;
2476 inst->header_size = 1;
2477 param_base = inst->base_mrf + 1;
2478 } else {
2479 inst->mlen = 1;
2480 param_base = inst->base_mrf;
2481 }
2482
2483 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2484 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2485 int zero_mask = 0xf & ~coord_mask;
2486
2487 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2488 coordinate));
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2491 src_reg(0)));
2492
2493 emit(inst);
2494 return src_reg(inst->dst);
2495 }
2496
2497 static bool
2498 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2499 {
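   /* Sampler indices that don't fit in the message's 4-bit sampler field
    * need the "high sampler" path (and hence a message header, see the
    * header_size computation in visit(ir_texture)). Pre-Haswell hardware
    * never takes this path.
    */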
2500 if (devinfo->gen < 8 && !devinfo->is_haswell)
2501 return false;
2502
2503 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_texture *ir)
2508 {
2509 uint32_t sampler =
2510 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2511
2512 ir_rvalue *nonconst_sampler_index =
2513 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2514
2515 /* Handle non-constant sampler array indexing */
2516 src_reg sampler_reg;
2517 if (nonconst_sampler_index) {
2518 /* The highest sampler which may be used by this operation is
2519 * the last element of the array. Mark it here, because the generator
2520 * doesn't have enough information to determine the bound.
2521 */
2522 uint32_t array_size = ir->sampler->as_dereference_array()
2523 ->array->type->array_size();
2524
2525 uint32_t max_used = sampler + array_size - 1;
2526 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2527 max_used += prog_data->base.binding_table.gather_texture_start;
2528 } else {
2529 max_used += prog_data->base.binding_table.texture_start;
2530 }
2531
2532 brw_mark_surface_used(&prog_data->base, max_used);
2533
2534 /* Emit code to evaluate the actual indexing expression */
2535 nonconst_sampler_index->accept(this);
2536 dst_reg temp(this, glsl_type::uint_type);
2537 emit(ADD(temp, this->result, src_reg(sampler)));
2538 emit_uniformize(temp, src_reg(temp));
2539
2540 sampler_reg = src_reg(temp);
2541 } else {
2542 /* Single sampler, or constant array index; the indexing expression
2543 * is just an immediate.
2544 */
2545 sampler_reg = src_reg(sampler);
2546 }
2547
2548 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2549 * emitting anything other than setting up the constant result.
2550 */
2551 if (ir->op == ir_tg4) {
2552 ir_constant *chan = ir->lod_info.component->as_constant();
2553 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2554 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2555 dst_reg result(this, ir->type);
2556 this->result = src_reg(result);
2557 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2558 return;
2559 }
2560 }
2561
2562 /* Should be lowered by do_lower_texture_projection */
2563 assert(!ir->projector);
2564
2565 /* Should be lowered */
2566 assert(!ir->offset || !ir->offset->type->is_array());
2567
2568 /* Generate code to compute all the subexpression trees. This has to be
2569 * done before loading any values into MRFs for the sampler message since
2570 * generating these values may involve SEND messages that need the MRFs.
2571 */
2572 src_reg coordinate;
2573 if (ir->coordinate) {
2574 ir->coordinate->accept(this);
2575 coordinate = this->result;
2576 }
2577
2578 src_reg shadow_comparitor;
2579 if (ir->shadow_comparitor) {
2580 ir->shadow_comparitor->accept(this);
2581 shadow_comparitor = this->result;
2582 }
2583
2584 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2585 src_reg offset_value;
2586 if (has_nonconstant_offset) {
2587 ir->offset->accept(this);
2588 offset_value = src_reg(this->result);
2589 }
2590
2591 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2592 src_reg lod, dPdx, dPdy, sample_index, mcs;
2593 switch (ir->op) {
2594 case ir_tex:
2595 lod = src_reg(0.0f);
2596 lod_type = glsl_type::float_type;
2597 break;
2598 case ir_txf:
2599 case ir_txl:
2600 case ir_txs:
2601 ir->lod_info.lod->accept(this);
2602 lod = this->result;
2603 lod_type = ir->lod_info.lod->type;
2604 break;
2605 case ir_query_levels:
2606 lod = src_reg(0);
2607 lod_type = glsl_type::int_type;
2608 break;
2609 case ir_txf_ms:
2610 ir->lod_info.sample_index->accept(this);
2611 sample_index = this->result;
2612 sample_index_type = ir->lod_info.sample_index->type;
2613
2614 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2615 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2616 else
2617 mcs = src_reg(0u);
2618 break;
2619 case ir_txd:
2620 ir->lod_info.grad.dPdx->accept(this);
2621 dPdx = this->result;
2622
2623 ir->lod_info.grad.dPdy->accept(this);
2624 dPdy = this->result;
2625
2626 lod_type = ir->lod_info.grad.dPdx->type;
2627 break;
2628 case ir_txb:
2629 case ir_lod:
2630 case ir_tg4:
2631 break;
2632 }
2633
2634 enum opcode opcode;
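/* Note that ir_tex maps to TXL: vertex shaders have no implicit
 * derivatives, so an ordinary texture() call samples with the explicit
 * lod of 0.0 loaded above.
 */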
2635 switch (ir->op) {
2636 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2637 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2638 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2639 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2640 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2641 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2642 case ir_tg4: opcode = has_nonconstant_offset
2643 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2644 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2645 case ir_txb:
2646 unreachable("TXB is not valid for vertex shaders.");
2647 case ir_lod:
2648 unreachable("LOD is not valid for vertex shaders.");
2649 default:
2650 unreachable("Unrecognized tex op");
2651 }
2652
2653 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2654 opcode, dst_reg(this, ir->type));
2655
2656 if (ir->offset != NULL && !has_nonconstant_offset) {
2657 inst->offset =
2658 brw_texture_offset(ir->offset->as_constant()->value.i,
2659 ir->offset->type->vector_elements);
2660 }
2661
2662 /* Stuff the channel select bits in the top of the texture offset */
2663 if (ir->op == ir_tg4)
2664 inst->offset |= gather_channel(ir, sampler) << 16;
2665
2666 /* The message header is necessary for:
2667 * - Gen4 (always)
2668 * - Gen9+ for selecting SIMD4x2
2669 * - Texel offsets
2670 * - Gather channel selection
2671 * - Sampler indices too large to fit in a 4-bit value.
2672 */
2673 inst->header_size =
2674 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2675 inst->offset != 0 || ir->op == ir_tg4 ||
2676 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2677 inst->base_mrf = 2;
2678 inst->mlen = inst->header_size + 1; /* always at least one */
2679 inst->dst.writemask = WRITEMASK_XYZW;
2680 inst->shadow_compare = ir->shadow_comparitor != NULL;
2681
2682 inst->src[1] = sampler_reg;
2683
2684 /* MRF for the first parameter */
2685 int param_base = inst->base_mrf + inst->header_size;
2686
2687 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2688 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2689 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2690 } else {
2691 /* Load the coordinate */
2692 /* FINISHME: gl_clamp_mask and saturate */
2693 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2694 int zero_mask = 0xf & ~coord_mask;
2695
2696 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2697 coordinate));
2698
2699 if (zero_mask != 0) {
2700 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2701 src_reg(0)));
2702 }
2703 /* Load the shadow comparitor */
2704 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2705 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2706 WRITEMASK_X),
2707 shadow_comparitor));
2708 inst->mlen++;
2709 }
2710
2711 /* Load the LOD info */
2712 if (ir->op == ir_tex || ir->op == ir_txl) {
2713 int mrf, writemask;
2714 if (devinfo->gen >= 5) {
2715 mrf = param_base + 1;
2716 if (ir->shadow_comparitor) {
2717 writemask = WRITEMASK_Y;
2718 /* mlen already incremented */
2719 } else {
2720 writemask = WRITEMASK_X;
2721 inst->mlen++;
2722 }
2723 } else /* devinfo->gen == 4 */ {
2724 mrf = param_base;
2725 writemask = WRITEMASK_W;
2726 }
2727 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2728 } else if (ir->op == ir_txf) {
2729 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2730 } else if (ir->op == ir_txf_ms) {
2731 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2732 sample_index));
2733 if (devinfo->gen >= 7) {
2734 /* MCS data is in the first channel of `mcs`, but we need to get it into
2735 * the .y channel of the second vec4 of params, so replicate .x across
2736 * the whole vec4 and then mask off everything except .y
2737 */
2738 mcs.swizzle = BRW_SWIZZLE_XXXX;
2739 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2740 mcs));
2741 }
2742 inst->mlen++;
2743 } else if (ir->op == ir_txd) {
2744 const glsl_type *type = lod_type;
2745
2746 if (devinfo->gen >= 5) {
2747 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2748 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2749 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2750 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2751 inst->mlen++;
2752
2753 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2754 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2755 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2756 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2758 inst->mlen++;
2759
2760 if (ir->shadow_comparitor) {
2761 emit(MOV(dst_reg(MRF, param_base + 2,
2762 ir->shadow_comparitor->type, WRITEMASK_Z),
2763 shadow_comparitor));
2764 }
2765 }
2766 } else /* devinfo->gen == 4 */ {
2767 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2768 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2769 inst->mlen += 2;
2770 }
2771 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2772 if (ir->shadow_comparitor) {
2773 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2774 shadow_comparitor));
2775 }
2776
2777 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2778 offset_value));
2779 inst->mlen++;
2780 }
2781 }
2782
2783 emit(inst);
2784
2785 /* Fix up the number of layers (the .z component of the txs result) for
2786 * cube map arrays: the hardware returns faces * layers, but the spec
2787 * requires just the layer count.
2787 */
2788 if (ir->op == ir_txs) {
2789 glsl_type const *type = ir->sampler->type;
2790 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2791 type->sampler_array) {
2792 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2793 writemask(inst->dst, WRITEMASK_Z),
2794 src_reg(inst->dst), src_reg(6));
2795 }
2796 }
2797
2798 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2799 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2800 }
2801
2802 swizzle_result(ir, src_reg(inst->dst), sampler);
2803 }
2804
2805 /**
2806 * Apply workarounds for Gen6 gather with UINT/SINT
2807 */
2808 void
2809 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2810 {
2811 if (!wa)
2812 return;
2813
2814 int width = (wa & WA_8BIT) ? 8 : 16;
2815 dst_reg dst_f = dst;
2816 dst_f.type = BRW_REGISTER_TYPE_F;
2817
2818 /* Convert from UNORM to UINT */
2819 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2820 emit(MOV(dst, src_reg(dst_f)));
2821
2822 if (wa & WA_SIGN) {
2823 /* Reinterpret the UINT value as a signed INT value by
2824 * shifting the sign bit into place, then shifting back
2825 * preserving sign.
2826 */
2827 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2828 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2829 }
2830 }
2831
2832 /**
2833 * Set up the gather channel based on the swizzle, for gather4.
2834 */
2835 uint32_t
2836 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2837 {
2838 ir_constant *chan = ir->lod_info.component->as_constant();
2839 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2840 switch (swiz) {
2841 case SWIZZLE_X: return 0;
2842 case SWIZZLE_Y:
2843 /* gather4 sampler is broken for green channel on RG32F --
2844 * we must ask for blue instead.
2845 */
2846 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2847 return 2;
2848 return 1;
2849 case SWIZZLE_Z: return 2;
2850 case SWIZZLE_W: return 3;
2851 default:
2852 unreachable("Not reached"); /* zero, one swizzles handled already */
2853 }
2854 }
2855
2856 void
2857 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2858 {
2859 int s = key->tex.swizzles[sampler];
2860
2861 this->result = src_reg(this, ir->type);
2862 dst_reg swizzled_result(this->result);
2863
2864 if (ir->op == ir_query_levels) {
2865 /* # levels is in .w */
2866 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2867 emit(MOV(swizzled_result, orig_val));
2868 return;
2869 }
2870
2871 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2872 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2873 emit(MOV(swizzled_result, orig_val));
2874 return;
2875 }
2876
2877
2878 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2879 int swizzle[4] = {0};
2880
2881 for (int i = 0; i < 4; i++) {
2882 switch (GET_SWZ(s, i)) {
2883 case SWIZZLE_ZERO:
2884 zero_mask |= (1 << i);
2885 break;
2886 case SWIZZLE_ONE:
2887 one_mask |= (1 << i);
2888 break;
2889 default:
2890 copy_mask |= (1 << i);
2891 swizzle[i] = GET_SWZ(s, i);
2892 break;
2893 }
2894 }
2895
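/* e.g. a texture swizzle of (R, G, ONE, ZERO) copies .xy from the sampled
 * value, then the stores below write 1.0f to .z and 0.0f to .w.
 */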
2896 if (copy_mask) {
2897 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2898 swizzled_result.writemask = copy_mask;
2899 emit(MOV(swizzled_result, orig_val));
2900 }
2901
2902 if (zero_mask) {
2903 swizzled_result.writemask = zero_mask;
2904 emit(MOV(swizzled_result, src_reg(0.0f)));
2905 }
2906
2907 if (one_mask) {
2908 swizzled_result.writemask = one_mask;
2909 emit(MOV(swizzled_result, src_reg(1.0f)));
2910 }
2911 }
2912
2913 void
2914 vec4_visitor::visit(ir_return *)
2915 {
2916 unreachable("not reached");
2917 }
2918
2919 void
2920 vec4_visitor::visit(ir_discard *)
2921 {
2922 unreachable("not reached");
2923 }
2924
2925 void
2926 vec4_visitor::visit(ir_if *ir)
2927 {
2928 /* Don't point the annotation at the if statement, because then it plus
2929 * the then and else blocks get printed.
2930 */
2931 this->base_ir = ir->condition;
2932
2933 if (devinfo->gen == 6) {
2934 emit_if_gen6(ir);
2935 } else {
2936 enum brw_predicate predicate;
2937 emit_bool_to_cond_code(ir->condition, &predicate);
2938 emit(IF(predicate));
2939 }
2940
2941 visit_instructions(&ir->then_instructions);
2942
2943 if (!ir->else_instructions.is_empty()) {
2944 this->base_ir = ir->condition;
2945 emit(BRW_OPCODE_ELSE);
2946
2947 visit_instructions(&ir->else_instructions);
2948 }
2949
2950 this->base_ir = ir->condition;
2951 emit(BRW_OPCODE_ENDIF);
2952 }
2953
2954 void
2955 vec4_visitor::visit(ir_emit_vertex *)
2956 {
2957 unreachable("not reached");
2958 }
2959
2960 void
2961 vec4_visitor::visit(ir_end_primitive *)
2962 {
2963 unreachable("not reached");
2964 }
2965
2966 void
2967 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2968 dst_reg dst, src_reg offset,
2969 src_reg src0, src_reg src1)
2970 {
2971 unsigned mlen = 0;
2972
2973 /* Set the atomic operation offset. */
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2975 mlen++;
2976
2977 /* Set the atomic operation arguments. */
2978 if (src0.file != BAD_FILE) {
2979 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2980 mlen++;
2981 }
2982
2983 if (src1.file != BAD_FILE) {
2984 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2985 mlen++;
2986 }
2987
2988 /* Emit the instruction. Note that this maps to the normal SIMD8
2989 * untyped atomic message on Ivy Bridge, but that's OK because
2990 * unused channels will be masked out.
2991 */
2992 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2993 brw_message_reg(0),
2994 src_reg(surf_index), src_reg(atomic_op));
2995 inst->mlen = mlen;
2996 }
2997
2998 void
2999 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3000 src_reg offset)
3001 {
3002 /* Set the surface read offset. */
3003 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3004
3005 /* Emit the instruction. Note that this maps to the normal SIMD8
3006 * untyped surface read message, but that's OK because unused
3007 * channels will be masked out.
3008 */
3009 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3010 brw_message_reg(0),
3011 src_reg(surf_index), src_reg(1));
3012 inst->mlen = 1;
3013 }
3014
3015 void
3016 vec4_visitor::emit_ndc_computation()
3017 {
3018 /* Get the position */
3019 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3020
3021 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3022 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3023 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3024
3025 current_annotation = "NDC";
3026 dst_reg ndc_w = ndc;
3027 ndc_w.writemask = WRITEMASK_W;
3028 src_reg pos_w = pos;
3029 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3030 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3031
3032 dst_reg ndc_xyz = ndc;
3033 ndc_xyz.writemask = WRITEMASK_XYZ;
3034
3035 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3036 }
3037
3038 void
3039 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3040 {
3041 if (devinfo->gen < 6 &&
3042 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3043 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3044 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3045 dst_reg header1_w = header1;
3046 header1_w.writemask = WRITEMASK_W;
3047
3048 emit(MOV(header1, 0u));
3049
3050 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3051 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3052
3053 current_annotation = "Point size";
3054 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3055 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3056 }
3057
3058 if (key->userclip_active) {
3059 current_annotation = "Clipping flags";
3060 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3061 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3062
3063 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3064 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3065 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3066
3067 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3068 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3069 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3070 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3071 }
3072
3073 /* i965 clipping workaround:
3074 * 1) Test for -ve rhw
3075 * 2) If set,
3076 * set ndc = (0,0,0,0)
3077 * set ucp[6] = 1
3078 *
3079 * Later, clipping will detect ucp[6] and ensure the primitive is
3080 * clipped against all fixed planes.
3081 */
3082 if (devinfo->has_negative_rhw_bug) {
3083 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3084 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3085 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3086 vec4_instruction *inst;
3087 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3088 inst->predicate = BRW_PREDICATE_NORMAL;
3089 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3090 inst->predicate = BRW_PREDICATE_NORMAL;
3091 }
3092
3093 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3094 } else if (devinfo->gen < 6) {
3095 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3096 } else {
3097 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3098 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3099 dst_reg reg_w = reg;
3100 reg_w.writemask = WRITEMASK_W;
3101 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3102 }
3103 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3104 dst_reg reg_y = reg;
3105 reg_y.writemask = WRITEMASK_Y;
3106 reg_y.type = BRW_REGISTER_TYPE_D;
3107 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3108 }
3109 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3110 dst_reg reg_z = reg;
3111 reg_z.writemask = WRITEMASK_Z;
3112 reg_z.type = BRW_REGISTER_TYPE_D;
3113 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3114 }
3115 }
3116 }
3117
3118 void
3119 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3120 {
3121 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3122 *
3123 * "If a linked set of shaders forming the vertex stage contains no
3124 * static write to gl_ClipVertex or gl_ClipDistance, but the
3125 * application has requested clipping against user clip planes through
3126 * the API, then the coordinate written to gl_Position is used for
3127 * comparison against the user clip planes."
3128 *
3129 * This function is only called if the shader didn't write to
3130 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3131 * if the user wrote to it; otherwise we use gl_Position.
3132 */
3133 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3134 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3135 clip_vertex = VARYING_SLOT_POS;
3136 }
3137
3138 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3139 ++i) {
3140 reg.writemask = 1 << i;
3141 emit(DP4(reg,
3142 src_reg(output_reg[clip_vertex]),
3143 src_reg(this->userplane[i + offset])));
3144 }
3145 }
3146
3147 vec4_instruction *
3148 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3149 {
3150 assert (varying < VARYING_SLOT_MAX);
3151 reg.type = output_reg[varying].type;
3152 current_annotation = output_reg_annotation[varying];
3153 /* Copy the register, saturating if necessary */
3154 return emit(MOV(reg, src_reg(output_reg[varying])));
3155 }
3156
3157 void
3158 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3159 {
3160 reg.type = BRW_REGISTER_TYPE_F;
3161
3162 switch (varying) {
3163 case VARYING_SLOT_PSIZ:
3164 {
3165 /* PSIZ is always in slot 0, and is coupled with other flags. */
3166 current_annotation = "indices, point width, clip flags";
3167 emit_psiz_and_flags(reg);
3168 break;
3169 }
3170 case BRW_VARYING_SLOT_NDC:
3171 current_annotation = "NDC";
3172 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3173 break;
3174 case VARYING_SLOT_POS:
3175 current_annotation = "gl_Position";
3176 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3177 break;
3178 case VARYING_SLOT_EDGE:
3179 /* This is present when doing unfilled polygons. We're supposed to copy
3180 * the edge flag from the user-provided vertex array
3181 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3182 * of that attribute (starts as 1.0f). This is then used in clipping to
3183 * determine which edges should be drawn as wireframe.
3184 */
3185 current_annotation = "edge flag";
3186 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3187 glsl_type::float_type, WRITEMASK_XYZW))));
3188 break;
3189 case BRW_VARYING_SLOT_PAD:
3190 /* No need to write to this slot */
3191 break;
3192 case VARYING_SLOT_COL0:
3193 case VARYING_SLOT_COL1:
3194 case VARYING_SLOT_BFC0:
3195 case VARYING_SLOT_BFC1: {
3196 /* These built-in varyings are only supported in compatibility mode,
3197 * and we only support GS in core profile. So, this must be a vertex
3198 * shader.
3199 */
3200 assert(stage == MESA_SHADER_VERTEX);
3201 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3202 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3203 inst->saturate = true;
3204 break;
3205 }
3206
3207 default:
3208 emit_generic_urb_slot(reg, varying);
3209 break;
3210 }
3211 }
3212
3213 static int
3214 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3215 {
3216 if (devinfo->gen >= 6) {
3217 /* URB data written (does not include the message header reg) must
3218 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3219 * section 5.4.3.2.2: URB_INTERLEAVED.
3220 *
3221 * URB entries are allocated on a multiple of 1024 bits, so an
3222 * extra 128 bits written here to make the end align to 256 is
3223 * no problem.
3224 */
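/* mlen counts the header register too, so the data portion (mlen - 1) is
 * even exactly when mlen is odd; e.g. a header plus three data regs
 * (mlen == 4) gets padded to mlen == 5.
 */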
3225 if ((mlen % 2) != 1)
3226 mlen++;
3227 }
3228
3229 return mlen;
3230 }
3231
3232
3233 /**
3234 * Generates the VUE payload plus the necessary URB write instructions to
3235 * output it.
3236 *
3237 * The VUE layout is documented in Volume 2a.
3238 */
3239 void
3240 vec4_visitor::emit_vertex()
3241 {
3242 /* MRF 0 is reserved for the debugger, so start with message header
3243 * in MRF 1.
3244 */
3245 int base_mrf = 1;
3246 int mrf = base_mrf;
3247 /* In the process of generating our URB write message contents, we
3248 * may need to unspill a register or load from an array. Those
3249 * reads would use MRFs 14-15.
3250 */
3251 int max_usable_mrf = 13;
3252
3253 /* The following assertion verifies that max_usable_mrf causes an
3254 * even-numbered amount of URB write data, which will meet gen6's
3255 * requirements for length alignment.
3256 */
3257 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3258
3259 /* First mrf is the g0-based message header containing URB handles and
3260 * such.
3261 */
3262 emit_urb_write_header(mrf++);
3263
3264 if (devinfo->gen < 6) {
3265 emit_ndc_computation();
3266 }
3267
3268 /* Lower legacy ff and ClipVertex clipping to clip distances */
3269 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3270 current_annotation = "user clip distances";
3271
3272 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3273 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3274
3275 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3276 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3277 }
3278
3279 /* We may need to split this up into several URB writes, so do them in a
3280 * loop.
3281 */
3282 int slot = 0;
3283 bool complete = false;
3284 do {
3285 /* URB offset is in URB row increments, and each of our MRFs is half of
3286 * one of those, since we're doing interleaved writes.
3287 */
3288 int offset = slot / 2;
3289
3290 mrf = base_mrf + 1;
3291 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3292 emit_urb_slot(dst_reg(MRF, mrf++),
3293 prog_data->vue_map.slot_to_varying[slot]);
3294
3295 /* If this was max_usable_mrf, we can't fit anything more into this
3296 * URB WRITE.
3297 */
3298 if (mrf > max_usable_mrf) {
3299 slot++;
3300 break;
3301 }
3302 }
3303
3304 complete = slot >= prog_data->vue_map.num_slots;
3305 current_annotation = "URB write";
3306 vec4_instruction *inst = emit_urb_write_opcode(complete);
3307 inst->base_mrf = base_mrf;
3308 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3309 inst->offset += offset;
3310 } while(!complete);
3311 }
3312
3313
3314 src_reg
3315 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3316 src_reg *reladdr, int reg_offset)
3317 {
3318 /* Because we store the values to scratch interleaved like our
3319 * vertex data, we need to scale the vec4 index by 2.
3320 */
3321 int message_header_scale = 2;
3322
3323 /* Pre-gen6, the message header uses byte offsets instead of vec4
3324 * (16-byte) offset units.
3325 */
3326 if (devinfo->gen < 6)
3327 message_header_scale *= 16;
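/* e.g. reg_offset 3 becomes offset 6 in vec4 units on Gen6+, or byte
 * offset 96 (3 * 2 * 16) on older generations.
 */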
3328
3329 if (reladdr) {
3330 src_reg index = src_reg(this, glsl_type::int_type);
3331
3332 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3333 src_reg(reg_offset)));
3334 emit_before(block, inst, MUL(dst_reg(index), index,
3335 src_reg(message_header_scale)));
3336
3337 return index;
3338 } else {
3339 return src_reg(reg_offset * message_header_scale);
3340 }
3341 }
3342
3343 src_reg
3344 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3345 src_reg *reladdr, int reg_offset)
3346 {
3347 if (reladdr) {
3348 src_reg index = src_reg(this, glsl_type::int_type);
3349
3350 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3351 src_reg(reg_offset)));
3352
3353 /* Pre-gen6, the message header uses byte offsets instead of vec4
3354 * (16-byte) offset units.
3355 */
3356 if (devinfo->gen < 6) {
3357 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3358 }
3359
3360 return index;
3361 } else if (devinfo->gen >= 8) {
3362 /* Store the offset in a GRF so we can send-from-GRF. */
3363 src_reg offset = src_reg(this, glsl_type::int_type);
3364 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3365 return offset;
3366 } else {
3367 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3368 return src_reg(reg_offset * message_header_scale);
3369 }
3370 }
3371
3372 /**
3373 * Emits an instruction before @inst to load the value named by @orig_src
3374 * from scratch space at @base_offset to @temp.
3375 *
3376 * @base_offset is measured in 32-byte units (the size of a register).
3377 */
3378 void
3379 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3380 dst_reg temp, src_reg orig_src,
3381 int base_offset)
3382 {
3383 int reg_offset = base_offset + orig_src.reg_offset;
3384 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3385 reg_offset);
3386
3387 emit_before(block, inst, SCRATCH_READ(temp, index));
3388 }
3389
3390 /**
3391 * Emits an instruction after @inst to store the value to be written
3392 * to @orig_dst to scratch space at @base_offset, from @temp.
3393 *
3394 * @base_offset is measured in 32-byte units (the size of a register).
3395 */
3396 void
3397 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3398 int base_offset)
3399 {
3400 int reg_offset = base_offset + inst->dst.reg_offset;
3401 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3402 reg_offset);
3403
3404 /* Create a temporary register to store *inst's result in.
3405 *
3406 * We have to be careful in MOVing from our temporary result register in
3407 * the scratch write. If we swizzle from channels of the temporary that
3408 * weren't initialized, it will confuse live interval analysis, which will
3409 * make spilling fail to make progress.
3410 */
3411 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3412 inst->dst.type),
3413 brw_swizzle_for_mask(inst->dst.writemask));
3414 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3415 inst->dst.writemask));
3416 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3417 write->predicate = inst->predicate;
3418 write->ir = inst->ir;
3419 write->annotation = inst->annotation;
3420 inst->insert_after(block, write);
3421
3422 inst->dst.file = temp.file;
3423 inst->dst.reg = temp.reg;
3424 inst->dst.reg_offset = temp.reg_offset;
3425 inst->dst.reladdr = NULL;
3426 }
3427
3428 /**
3429 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3430 * adds the scratch read(s) before \p inst. The function also checks for
3431 * recursive reladdr scratch accesses, issuing the corresponding scratch
3432 * loads and rewriting reladdr references accordingly.
3433 *
3434 * \return \p src if it did not require a scratch load, otherwise, the
3435 * register holding the result of the scratch load that the caller should
3436 * use to rewrite src.
3437 */
3438 src_reg
3439 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3440 vec4_instruction *inst, src_reg src)
3441 {
3442 /* Resolve recursive reladdr scratch access by calling ourselves
3443 * with src.reladdr
3444 */
3445 if (src.reladdr)
3446 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3447 *src.reladdr);
3448
3449 /* Now handle scratch access on src */
3450 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3451 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3452 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3453 src.reg = temp.reg;
3454 src.reg_offset = temp.reg_offset;
3455 src.reladdr = NULL;
3456 }
3457
3458 return src;
3459 }
3460
3461 /**
3462 * We can't generally support array access in GRF space, because a
3463 * single instruction's destination can only span 2 contiguous
3464 * registers. So, we send all GRF arrays that get variable index
3465 * access to scratch space.
3466 */
3467 void
3468 vec4_visitor::move_grf_array_access_to_scratch()
3469 {
3470 int scratch_loc[this->alloc.count];
3471 memset(scratch_loc, -1, sizeof(scratch_loc));
3472
3473 /* First, calculate the set of virtual GRFs that need to be punted
3474 * to scratch due to having any array access on them, and where in
3475 * scratch.
3476 */
3477 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3478 if (inst->dst.file == GRF && inst->dst.reladdr) {
3479 if (scratch_loc[inst->dst.reg] == -1) {
3480 scratch_loc[inst->dst.reg] = c->last_scratch;
3481 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3482 }
3483
3484 for (src_reg *iter = inst->dst.reladdr;
3485 iter->reladdr;
3486 iter = iter->reladdr) {
3487 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3488 scratch_loc[iter->reg] = c->last_scratch;
3489 c->last_scratch += this->alloc.sizes[iter->reg];
3490 }
3491 }
3492 }
3493
3494 for (int i = 0 ; i < 3; i++) {
3495 for (src_reg *iter = &inst->src[i];
3496 iter->reladdr;
3497 iter = iter->reladdr) {
3498 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3499 scratch_loc[iter->reg] = c->last_scratch;
3500 c->last_scratch += this->alloc.sizes[iter->reg];
3501 }
3502 }
3503 }
3504 }
3505
3506 /* Now, for anything that will be accessed through scratch, rewrite
3507 * it to load/store. Note that this is a _safe list walk, because
3508 * we may generate a new scratch_write instruction after the one
3509 * we're processing.
3510 */
3511 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3512 /* Set up the annotation tracking for new generated instructions. */
3513 base_ir = inst->ir;
3514 current_annotation = inst->annotation;
3515
3516 /* First handle scratch access on the dst. Notice we have to handle
3517 * the case where the dst's reladdr also points to scratch space.
3518 */
3519 if (inst->dst.reladdr)
3520 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3521 *inst->dst.reladdr);
3522
3523 /* Now that we have handled any (possibly recursive) reladdr scratch
3524 * accesses for dst we can safely do the scratch write for dst itself
3525 */
3526 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3527 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3528
3529 /* Now handle scratch access on any src. In this case, since inst->src[i]
3530 * already is a src_reg, we can just call emit_resolve_reladdr with
3531 * inst->src[i] and it will take care of handling scratch loads for
3532 * both src and src.reladdr (recursively).
3533 */
3534 for (int i = 0 ; i < 3; i++) {
3535 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3536 inst->src[i]);
3537 }
3538 }
3539 }
3540
3541 /**
3542 * Emits an instruction before @inst to load the value named by @orig_src
3543 * from the pull constant buffer (surface) at @base_offset to @temp.
3544 */
3545 void
3546 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3547 dst_reg temp, src_reg orig_src,
3548 int base_offset)
3549 {
3550 int reg_offset = base_offset + orig_src.reg_offset;
3551 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3552 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3553 reg_offset);
3554
3555 emit_pull_constant_load_reg(temp,
3556 index,
3557 offset,
3558 block, inst);
3559 }
3560
3561 /**
3562 * Implements array access of uniforms by inserting a
3563 * PULL_CONSTANT_LOAD instruction.
3564 *
3565 * Unlike temporary GRF array access (where we don't support it due to
3566 * the difficulty of doing relative addressing on instruction
3567 * destinations), we could potentially do array access of uniforms
3568 * that were loaded in GRF space as push constants. In real-world
3569 * usage we've seen, though, the arrays being used are always larger
3570 * than we could load as push constants, so just always move all
3571 * uniform array access out to a pull constant buffer.
3572 */
3573 void
3574 vec4_visitor::move_uniform_array_access_to_pull_constants()
3575 {
3576 int pull_constant_loc[this->uniforms];
3577 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3578 bool nested_reladdr;
3579
3580 /* Walk through and find array access of uniforms. Put a copy of that
3581 * uniform in the pull constant buffer.
3582 *
3583 * Note that we don't move constant-indexed accesses to arrays. No
3584 * testing has been done of the performance impact of this choice.
3585 */
3586 do {
3587 nested_reladdr = false;
3588
3589 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3590 for (int i = 0 ; i < 3; i++) {
3591 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3592 continue;
3593
3594 int uniform = inst->src[i].reg;
3595
3596 if (inst->src[i].reladdr->reladdr)
3597 nested_reladdr = true; /* will need another pass */
3598
3599 /* If this array isn't already present in the pull constant buffer,
3600 * add it.
3601 */
3602 if (pull_constant_loc[uniform] == -1) {
3603 const gl_constant_value **values =
3604 &stage_prog_data->param[uniform * 4];
3605
3606 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3607
3608 assert(uniform < uniform_array_size);
3609 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3610 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3611 = values[j];
3612 }
3613 }
3614
3615 /* Set up the annotation tracking for new generated instructions. */
3616 base_ir = inst->ir;
3617 current_annotation = inst->annotation;
3618
3619 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3620
3621 emit_pull_constant_load(block, inst, temp, inst->src[i],
3622 pull_constant_loc[uniform]);
3623
3624 inst->src[i].file = temp.file;
3625 inst->src[i].reg = temp.reg;
3626 inst->src[i].reg_offset = temp.reg_offset;
3627 inst->src[i].reladdr = NULL;
3628 }
3629 }
3630 } while (nested_reladdr);
3631
3632 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3633 * no need to track them as larger-than-vec4 objects. This will be
3634 * relied on in cutting out unused uniform vectors from push
3635 * constants.
3636 */
3637 split_uniform_registers();
3638 }
3639
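/**
 * Lower a negate modifier on an unsigned (UD) source by materializing the
 * negated value with an explicit MOV, so the consuming instruction sees a
 * plain register without the modifier.
 */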
3640 void
3641 vec4_visitor::resolve_ud_negate(src_reg *reg)
3642 {
3643 if (reg->type != BRW_REGISTER_TYPE_UD ||
3644 !reg->negate)
3645 return;
3646
3647 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3648 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3649 *reg = temp;
3650 }
3651
3652 /**
3653 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3654 *
3655 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3656 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3657 */
3658 void
3659 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3660 {
3661 assert(devinfo->gen <= 5);
3662
3663 if (!rvalue->type->is_boolean())
3664 return;
3665
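   /* Keep only the defined LSB, then negate it: 1 becomes -1 (all bits
    * set), giving the canonical 0/~0 boolean.
    */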
3666 src_reg and_result = src_reg(this, rvalue->type);
3667 src_reg neg_result = src_reg(this, rvalue->type);
3668 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3669 emit(MOV(dst_reg(neg_result), negate(and_result)));
3670 *reg = neg_result;
3671 }
3672
3673 vec4_visitor::vec4_visitor(struct brw_context *brw,
3674 struct brw_vec4_compile *c,
3675 struct gl_program *prog,
3676 const struct brw_vue_prog_key *key,
3677 struct brw_vue_prog_data *prog_data,
3678 struct gl_shader_program *shader_prog,
3679 gl_shader_stage stage,
3680 void *mem_ctx,
3681 bool no_spills,
3682 shader_time_shader_type st_base,
3683 shader_time_shader_type st_written,
3684 shader_time_shader_type st_reset)
3685 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3686 c(c),
3687 key(key),
3688 prog_data(prog_data),
3689 sanity_param_count(0),
3690 fail_msg(NULL),
3691 first_non_payload_grf(0),
3692 need_all_constants_in_pull_buffer(false),
3693 no_spills(no_spills),
3694 st_base(st_base),
3695 st_written(st_written),
3696 st_reset(st_reset)
3697 {
3698 this->mem_ctx = mem_ctx;
3699 this->failed = false;
3700
3701 this->base_ir = NULL;
3702 this->current_annotation = NULL;
3703 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3704
3705 this->variable_ht = hash_table_ctor(0,
3706 hash_table_pointer_hash,
3707 hash_table_pointer_compare);
3708
3709 this->virtual_grf_start = NULL;
3710 this->virtual_grf_end = NULL;
3711 this->live_intervals = NULL;
3712
3713 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3714
3715 this->uniforms = 0;
3716
3717 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3718 * at least one. See setup_uniforms() in brw_vec4.cpp.
3719 */
3720 this->uniform_array_size = 1;
3721 if (prog_data) {
3722 this->uniform_array_size =
3723 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3724 }
3725
3726 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3727 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3728 }
3729
3730 vec4_visitor::~vec4_visitor()
3731 {
3732 hash_table_dtor(this->variable_ht);
3733 }
3734
3735
3736 void
3737 vec4_visitor::fail(const char *format, ...)
3738 {
3739 va_list va;
3740 char *msg;
3741
3742 if (failed)
3743 return;
3744
3745 failed = true;
3746
3747 va_start(va, format);
3748 msg = ralloc_vasprintf(mem_ctx, format, va);
3749 va_end(va);
3750 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3751
3752 this->fail_msg = msg;
3753
3754 if (debug_enabled) {
3755 fprintf(stderr, "%s", msg);
3756 }
3757 }
3758
3759 } /* namespace brw */