1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
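/* Emit the dot product of src0 and src1 into dst, using DP2, DP3 or DP4
 * according to the component count in elements (which must be 2, 3 or 4).
 */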
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416     * The upper word of each write-channel must be 0 for the following
417     * bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
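   /* Worked example (values chosen for illustration): packing vec2(1.0, 2.0)
    * yields the half-floats 0x3c00 and 0x4000, so tmp.xy becomes
    * (0x00003c00, 0x00004000); the SHL of tmp.yyyy below then gives
    * 0x40000000 and the final OR with tmp.xxxx gives 0x40003c00, which is
    * packHalf2x16(vec2(1.0, 2.0)).
    */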
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
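   /* Worked example (values chosen for illustration): for src0 = 0x40003c00,
    * the AND below leaves 0x3c00 in tmp.x, the SHR leaves 0x4000 in tmp.y,
    * and F16TO32 then produces dst.xy = (1.0, 2.0), matching
    * unpackHalf2x16().
    */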
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
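   /* The bytes 0x00, 0x60, 0x70 and 0x78 are the restricted 8-bit float (VF)
    * encodings of 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below
    * turns them into the integer shift counts <0, 8, 16, 24>.
    */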
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
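   /* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
    * scale by 255, round to nearest even, convert to unsigned integer, and
    * pack the low byte of each channel into a single dword.
    */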
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575 /**
576 * Returns the minimum number of vec4 elements needed to pack a type.
577 *
578 * For simple types, it will return 1 (a single vec4); for matrices, the
579 * number of columns; for array and struct, the sum of the vec4_size of
580 * each of its elements; and for sampler and atomic, zero.
581 *
582 * This method is useful to calculate how much register space is needed to
583 * store a particular type.
584 */
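/* For example: a mat3 takes 3 vec4s, a float[4] array takes 4 (one per
 * element), and struct { vec3 a; float b; } takes 2, since each member is
 * padded out to a full vec4.
 */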
585 int
586 vec4_visitor::type_size(const struct glsl_type *type)
587 {
588 unsigned int i;
589 int size;
590
591 switch (type->base_type) {
592 case GLSL_TYPE_UINT:
593 case GLSL_TYPE_INT:
594 case GLSL_TYPE_FLOAT:
595 case GLSL_TYPE_BOOL:
596 if (type->is_matrix()) {
597 return type->matrix_columns;
598 } else {
599          /* Regardless of the size of the vector, it gets a vec4. This is bad
600 * packing for things like floats, but otherwise arrays become a
601 * mess. Hopefully a later pass over the code can pack scalars
602 * down if appropriate.
603 */
604 return 1;
605 }
606 case GLSL_TYPE_ARRAY:
607 assert(type->length > 0);
608 return type_size(type->fields.array) * type->length;
609 case GLSL_TYPE_STRUCT:
610 size = 0;
611 for (i = 0; i < type->length; i++) {
612 size += type_size(type->fields.structure[i].type);
613 }
614 return size;
615 case GLSL_TYPE_SUBROUTINE:
616 return 1;
617
618 case GLSL_TYPE_SAMPLER:
619 /* Samplers take up no register space, since they're baked in at
620 * link time.
621 */
622 return 0;
623 case GLSL_TYPE_ATOMIC_UINT:
624 return 0;
625 case GLSL_TYPE_IMAGE:
626 case GLSL_TYPE_VOID:
627 case GLSL_TYPE_DOUBLE:
628 case GLSL_TYPE_ERROR:
629 case GLSL_TYPE_INTERFACE:
630 unreachable("not reached");
631 }
632
633 return 0;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->alloc.allocate(v->type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = brw_swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(v->type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->alloc.allocate(v->type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 void
683 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
684 unsigned n)
685 {
686 static const gl_constant_value zero = { 0 };
687
688 for (unsigned i = 0; i < n; ++i)
689 stage_prog_data->param[4 * uniforms + i] = &values[i];
690
691 for (unsigned i = n; i < 4; ++i)
692 stage_prog_data->param[4 * uniforms + i] = &zero;
693
694 uniform_vector_size[uniforms++] = n;
695 }
696
697 /* Our support for uniforms is piggy-backed on the gl_program struct
698  * for the stage, because that's where the values actually
699  * get stored, rather than in some global gl_shader_program uniform
700  * store.
701 */
702 void
703 vec4_visitor::setup_uniform_values(ir_variable *ir)
704 {
705 int namelen = strlen(ir->name);
706
707 /* The data for our (non-builtin) uniforms is stored in a series of
708 * gl_uniform_driver_storage structs for each subcomponent that
709 * glGetUniformLocation() could name. We know it's been set up in the same
710 * order we'd walk the type, so walk the list of storage and find anything
711 * with our name, or the prefix of a component that starts with our name.
712 */
713 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
714 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
715
716 if (storage->builtin)
717 continue;
718
719 if (strncmp(ir->name, storage->name, namelen) != 0 ||
720 (storage->name[namelen] != 0 &&
721 storage->name[namelen] != '.' &&
722 storage->name[namelen] != '[')) {
723 continue;
724 }
725
726 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
727 storage->type->matrix_columns);
728 const unsigned vector_size = storage->type->vector_elements;
729
730 for (unsigned s = 0; s < vector_count; s++)
731 setup_vector_uniform_values(&storage->storage[s * vector_size],
732 vector_size);
733 }
734 }
735
736 void
737 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
738 {
739 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
740 assert(this->uniforms < uniform_array_size);
741 this->uniform_vector_size[this->uniforms] = 4;
742 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
743 this->userplane[i].type = BRW_REGISTER_TYPE_F;
744 for (int j = 0; j < 4; ++j) {
745 stage_prog_data->param[this->uniforms * 4 + j] =
746 (gl_constant_value *) &clip_planes[i][j];
747 }
748 ++this->uniforms;
749 }
750 }
751
752 /* Our support for builtin uniforms is even scarier than non-builtin.
753 * It sits on top of the PROG_STATE_VAR parameters that are
754 * automatically updated from GL context state.
755 */
756 void
757 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
758 {
759 const ir_state_slot *const slots = ir->get_state_slots();
760 assert(slots != NULL);
761
762 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
763 /* This state reference has already been setup by ir_to_mesa,
764 * but we'll get the same index back here. We can reference
765 * ParameterValues directly, since unlike brw_fs.cpp, we never
766 * add new state references during compile.
767 */
768 int index = _mesa_add_state_reference(this->prog->Parameters,
769 (gl_state_index *)slots[i].tokens);
770 gl_constant_value *values =
771 &this->prog->Parameters->ParameterValues[index][0];
772
773 assert(this->uniforms < uniform_array_size);
774
775 for (unsigned j = 0; j < 4; j++)
776 stage_prog_data->param[this->uniforms * 4 + j] =
777 &values[GET_SWZ(slots[i].swizzle, j)];
778
779 this->uniform_vector_size[this->uniforms] =
780 (ir->type->is_scalar() || ir->type->is_vector() ||
781 ir->type->is_matrix() ? ir->type->vector_elements : 4);
782
783 this->uniforms++;
784 }
785 }
786
787 dst_reg *
788 vec4_visitor::variable_storage(ir_variable *var)
789 {
790 return (dst_reg *)hash_table_find(this->variable_ht, var);
791 }
792
793 void
794 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
795 enum brw_predicate *predicate)
796 {
797 ir_expression *expr = ir->as_expression();
798
799 *predicate = BRW_PREDICATE_NORMAL;
800
801 if (expr && expr->operation != ir_binop_ubo_load) {
802 src_reg op[3];
803 vec4_instruction *inst;
804
805 assert(expr->get_num_operands() <= 3);
806 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
807 expr->operands[i]->accept(this);
808 op[i] = this->result;
809
810 resolve_ud_negate(&op[i]);
811 }
812
813 switch (expr->operation) {
814 case ir_unop_logic_not:
815 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
816 inst->conditional_mod = BRW_CONDITIONAL_Z;
817 break;
818
819 case ir_binop_logic_xor:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(XOR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(XOR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_or:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(OR(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(OR(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_binop_logic_and:
842 if (devinfo->gen <= 5) {
843 src_reg temp = src_reg(this, ir->type);
844 emit(AND(dst_reg(temp), op[0], op[1]));
845 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
846 } else {
847 inst = emit(AND(dst_null_d(), op[0], op[1]));
848 }
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851
852 case ir_unop_f2b:
853 if (devinfo->gen >= 6) {
854 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
855 } else {
856 inst = emit(MOV(dst_null_f(), op[0]));
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 }
859 break;
860
861 case ir_unop_i2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_d(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_binop_all_equal:
871 if (devinfo->gen <= 5) {
872 resolve_bool_comparison(expr->operands[0], &op[0]);
873 resolve_bool_comparison(expr->operands[1], &op[1]);
874 }
875 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
876 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
877 break;
878
879 case ir_binop_any_nequal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
885 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
886 break;
887
888 case ir_unop_any:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_binop_greater:
897 case ir_binop_gequal:
898 case ir_binop_less:
899 case ir_binop_lequal:
900 case ir_binop_equal:
901 case ir_binop_nequal:
902 if (devinfo->gen <= 5) {
903 resolve_bool_comparison(expr->operands[0], &op[0]);
904 resolve_bool_comparison(expr->operands[1], &op[1]);
905 }
906 emit(CMP(dst_null_d(), op[0], op[1],
907 brw_conditional_for_comparison(expr->operation)));
908 break;
909
910 case ir_triop_csel: {
911 /* Expand the boolean condition into the flag register. */
912 inst = emit(MOV(dst_null_d(), op[0]));
913 inst->conditional_mod = BRW_CONDITIONAL_NZ;
914
915 /* Select which boolean to return. */
916 dst_reg temp(this, expr->operands[1]->type);
917 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
918 inst->predicate = BRW_PREDICATE_NORMAL;
919
920 /* Expand the result to a condition code. */
921 inst = emit(MOV(dst_null_d(), src_reg(temp)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 break;
924 }
925
926 default:
927 unreachable("not reached");
928 }
929 return;
930 }
931
932 ir->accept(this);
933
934 resolve_ud_negate(&this->result);
935
936 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
937 inst->conditional_mod = BRW_CONDITIONAL_NZ;
938 }
939
940 /**
941 * Emit a gen6 IF statement with the comparison folded into the IF
942 * instruction.
943 */
944 void
945 vec4_visitor::emit_if_gen6(ir_if *ir)
946 {
947 ir_expression *expr = ir->condition->as_expression();
948
949 if (expr && expr->operation != ir_binop_ubo_load) {
950 src_reg op[3];
951 dst_reg temp;
952
953 assert(expr->get_num_operands() <= 3);
954 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
955 expr->operands[i]->accept(this);
956 op[i] = this->result;
957 }
958
959 switch (expr->operation) {
960 case ir_unop_logic_not:
961 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
962 return;
963
964 case ir_binop_logic_xor:
965 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_binop_logic_or:
969 temp = dst_reg(this, glsl_type::bool_type);
970 emit(OR(temp, op[0], op[1]));
971 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_and:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(AND(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_unop_f2b:
981 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_i2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_binop_greater:
989 case ir_binop_gequal:
990 case ir_binop_less:
991 case ir_binop_lequal:
992 case ir_binop_equal:
993 case ir_binop_nequal:
994 emit(IF(op[0], op[1],
995 brw_conditional_for_comparison(expr->operation)));
996 return;
997
998 case ir_binop_all_equal:
999 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1001 return;
1002
1003 case ir_binop_any_nequal:
1004 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1005 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1006 return;
1007
1008 case ir_unop_any:
1009 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1010 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1011 return;
1012
1013 case ir_triop_csel: {
1014 /* Expand the boolean condition into the flag register. */
1015 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1016 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1017
1018 /* Select which boolean to return. */
1019 dst_reg temp(this, expr->operands[1]->type);
1020 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1021 inst->predicate = BRW_PREDICATE_NORMAL;
1022
1023 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1024 return;
1025 }
1026
1027 default:
1028 unreachable("not reached");
1029 }
1030 return;
1031 }
1032
1033 ir->condition->accept(this);
1034
1035 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_variable *ir)
1040 {
1041 dst_reg *reg = NULL;
1042
1043 if (variable_storage(ir))
1044 return;
1045
1046 switch (ir->data.mode) {
1047 case ir_var_shader_in:
1048 assert(ir->data.location != -1);
1049 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1050 break;
1051
1052 case ir_var_shader_out:
1053 assert(ir->data.location != -1);
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055
1056 for (int i = 0; i < type_size(ir->type); i++) {
1057 output_reg[ir->data.location + i] = *reg;
1058 output_reg[ir->data.location + i].reg_offset = i;
1059 output_reg[ir->data.location + i].type =
1060 brw_type_for_base_type(ir->type->get_scalar_type());
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined. So we can't use it.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
1265
1266 return true;
1267 }
1268
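/* Emit a MIN or MAX, selected by conditionalmod (BRW_CONDITIONAL_L or
 * BRW_CONDITIONAL_GE). Gen6+ can use SEL with a conditional modifier
 * directly; earlier hardware needs an explicit CMP to set the flag followed
 * by a predicated SEL.
 */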
1269 void
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284 }
1285
1286 void
1287 vec4_visitor::emit_lrp(const dst_reg &dst,
1288 const src_reg &x, const src_reg &y, const src_reg &a)
1289 {
1290 if (devinfo->gen >= 6) {
1291 /* Note that the instruction's argument order is reversed from GLSL
1292 * and the IR.
1293 */
1294 emit(LRP(dst,
1295 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1296 } else {
1297 /* Earlier generations don't support three source operations, so we
1298 * need to emit x*(1-a) + y*a.
1299 */
1300 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1301 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1302 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1303 y_times_a.writemask = dst.writemask;
1304 one_minus_a.writemask = dst.writemask;
1305 x_times_one_minus_a.writemask = dst.writemask;
1306
1307 emit(MUL(y_times_a, y, a));
1308 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1309 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1310 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1311 }
1312 }
1313
1314 /**
1315 * Emits the instructions needed to perform a pull constant load. before_block
1316  * and before_inst can be NULL, in which case the instructions will be appended
1317 * to the end of the instruction list.
1318 */
1319 void
1320 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1321 src_reg surf_index,
1322 src_reg offset_reg,
1323 bblock_t *before_block,
1324 vec4_instruction *before_inst)
1325 {
1326 assert((before_inst == NULL && before_block == NULL) ||
1327 (before_inst && before_block));
1328
1329 vec4_instruction *pull;
1330
1331 if (devinfo->gen >= 9) {
1332 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1333 src_reg header(this, glsl_type::uvec4_type, 2);
1334
1335 pull = new(mem_ctx)
1336 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1337 dst_reg(header));
1338
1339 if (before_inst)
1340 emit_before(before_block, before_inst, pull);
1341 else
1342 emit(pull);
1343
1344 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1345 offset_reg.type);
1346 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1347
1348 if (before_inst)
1349 emit_before(before_block, before_inst, pull);
1350 else
1351 emit(pull);
1352
1353 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1354 dst,
1355 surf_index,
1356 header);
1357 pull->mlen = 2;
1358 pull->header_size = 1;
1359 } else if (devinfo->gen >= 7) {
1360 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1361
1362 grf_offset.type = offset_reg.type;
1363
1364 pull = MOV(grf_offset, offset_reg);
1365
1366 if (before_inst)
1367 emit_before(before_block, before_inst, pull);
1368 else
1369 emit(pull);
1370
1371 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1372 dst,
1373 surf_index,
1374 src_reg(grf_offset));
1375 pull->mlen = 1;
1376 } else {
1377 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1378 dst,
1379 surf_index,
1380 offset_reg);
1381 pull->base_mrf = 14;
1382 pull->mlen = 1;
1383 }
1384
1385 if (before_inst)
1386 emit_before(before_block, before_inst, pull);
1387 else
1388 emit(pull);
1389 }
1390
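/* Copy the value of src from an arbitrary live channel into every channel of
 * the result, so that a potentially divergent value can be used where a
 * scalar (for example a surface index) is required.
 */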
1391 src_reg
1392 vec4_visitor::emit_uniformize(const src_reg &src)
1393 {
1394 const src_reg chan_index(this, glsl_type::uint_type);
1395 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1396 src.type);
1397
1398 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1399 ->force_writemask_all = true;
1400 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1401 ->force_writemask_all = true;
1402
1403 return src_reg(dst);
1404 }
1405
1406 void
1407 vec4_visitor::visit(ir_expression *ir)
1408 {
1409 unsigned int operand;
1410 src_reg op[ARRAY_SIZE(ir->operands)];
1411 vec4_instruction *inst;
1412
1413 if (ir->operation == ir_binop_add) {
1414 if (try_emit_mad(ir))
1415 return;
1416 }
1417
1418 if (ir->operation == ir_unop_b2f) {
1419 if (try_emit_b2f_of_compare(ir))
1420 return;
1421 }
1422
1423 /* Storage for our result. Ideally for an assignment we'd be using
1424 * the actual storage for the result here, instead.
1425 */
1426 dst_reg result_dst(this, ir->type);
1427 src_reg result_src(result_dst);
1428
1429 if (ir->operation == ir_triop_csel) {
1430 ir->operands[1]->accept(this);
1431 op[1] = this->result;
1432 ir->operands[2]->accept(this);
1433 op[2] = this->result;
1434
1435 enum brw_predicate predicate;
1436 emit_bool_to_cond_code(ir->operands[0], &predicate);
1437 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1438 inst->predicate = predicate;
1439 this->result = result_src;
1440 return;
1441 }
1442
1443 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1444 this->result.file = BAD_FILE;
1445 ir->operands[operand]->accept(this);
1446 if (this->result.file == BAD_FILE) {
1447 fprintf(stderr, "Failed to get tree for expression operand:\n");
1448 ir->operands[operand]->fprint(stderr);
1449 exit(1);
1450 }
1451 op[operand] = this->result;
1452
1453 /* Matrix expression operands should have been broken down to vector
1454 * operations already.
1455 */
1456 assert(!ir->operands[operand]->type->is_matrix());
1457 }
1458
1459 /* If nothing special happens, this is the result. */
1460 this->result = result_src;
1461
1462 switch (ir->operation) {
1463 case ir_unop_logic_not:
1464 emit(NOT(result_dst, op[0]));
1465 break;
1466 case ir_unop_neg:
1467 op[0].negate = !op[0].negate;
1468 emit(MOV(result_dst, op[0]));
1469 break;
1470 case ir_unop_abs:
1471 op[0].abs = true;
1472 op[0].negate = false;
1473 emit(MOV(result_dst, op[0]));
1474 break;
1475
1476 case ir_unop_sign:
1477 if (ir->type->is_float()) {
1478 /* AND(val, 0x80000000) gives the sign bit.
1479 *
1480 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1481 * zero.
1482 */
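         /* Worked example: for op[0] = -2.5f (0xc0200000) the CMP predicate
          * is true, the AND leaves 0x80000000 in the result, and the
          * predicated OR then yields 0xbf800000, i.e. -1.0f.
          */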
1483 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1484
1485 op[0].type = BRW_REGISTER_TYPE_UD;
1486 result_dst.type = BRW_REGISTER_TYPE_UD;
1487 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1488
1489 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1490 inst->predicate = BRW_PREDICATE_NORMAL;
1491
1492 this->result.type = BRW_REGISTER_TYPE_F;
1493 } else {
1494 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1495 * -> non-negative val generates 0x00000000.
1496 * Predicated OR sets 1 if val is positive.
1497 */
1498 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1499
1500 emit(ASR(result_dst, op[0], src_reg(31)));
1501
1502 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1503 inst->predicate = BRW_PREDICATE_NORMAL;
1504 }
1505 break;
1506
1507 case ir_unop_rcp:
1508 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1509 break;
1510
1511 case ir_unop_exp2:
1512 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1513 break;
1514 case ir_unop_log2:
1515 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1516 break;
1517 case ir_unop_exp:
1518 case ir_unop_log:
1519 unreachable("not reached: should be handled by ir_explog_to_explog2");
1520 case ir_unop_sin:
1521 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1522 break;
1523 case ir_unop_cos:
1524 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1525 break;
1526
1527 case ir_unop_dFdx:
1528 case ir_unop_dFdx_coarse:
1529 case ir_unop_dFdx_fine:
1530 case ir_unop_dFdy:
1531 case ir_unop_dFdy_coarse:
1532 case ir_unop_dFdy_fine:
1533 unreachable("derivatives not valid in vertex shader");
1534
1535 case ir_unop_bitfield_reverse:
1536 emit(BFREV(result_dst, op[0]));
1537 break;
1538 case ir_unop_bit_count:
1539 emit(CBIT(result_dst, op[0]));
1540 break;
1541 case ir_unop_find_msb: {
1542 src_reg temp = src_reg(this, glsl_type::uint_type);
1543
1544 inst = emit(FBH(dst_reg(temp), op[0]));
1545 inst->dst.writemask = WRITEMASK_XYZW;
1546
1547 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1548 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1549 * subtract the result from 31 to convert the MSB count into an LSB count.
1550 */
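      /* E.g. for an input of 0x00000100, FBH returns 23 and 31 - 23 = 8,
       * which is the value findMSB() must return.
       */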
1551
1552 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1553 temp.swizzle = BRW_SWIZZLE_NOOP;
1554 emit(MOV(result_dst, temp));
1555
1556 src_reg src_tmp = src_reg(result_dst);
1557 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1558
1559 src_tmp.negate = true;
1560 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1561 inst->predicate = BRW_PREDICATE_NORMAL;
1562 break;
1563 }
1564 case ir_unop_find_lsb:
1565 emit(FBL(result_dst, op[0]));
1566 break;
1567 case ir_unop_saturate:
1568 inst = emit(MOV(result_dst, op[0]));
1569 inst->saturate = true;
1570 break;
1571
1572 case ir_unop_noise:
1573 unreachable("not reached: should be handled by lower_noise");
1574
1575 case ir_unop_subroutine_to_int:
1576 emit(MOV(result_dst, op[0]));
1577 break;
1578
1579 case ir_binop_add:
1580 emit(ADD(result_dst, op[0], op[1]));
1581 break;
1582 case ir_binop_sub:
1583 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1584
1585 case ir_binop_mul:
1586 if (devinfo->gen < 8 && ir->type->is_integer()) {
1587 /* For integer multiplication, the MUL uses the low 16 bits of one of
1588 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1589 * accumulates in the contribution of the upper 16 bits of that
1590 * operand. If we can determine that one of the args is in the low
1591 * 16 bits, though, we can just emit a single MUL.
1592 */
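         /* In the general case below, MUL leaves the partial product in the
          * accumulator, MACH folds in the contribution of the upper 16 bits,
          * and the trailing MOV reads the low 32 bits of the product back out
          * of the accumulator.
          */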
1593 if (ir->operands[0]->is_uint16_constant()) {
1594 if (devinfo->gen < 7)
1595 emit(MUL(result_dst, op[0], op[1]));
1596 else
1597 emit(MUL(result_dst, op[1], op[0]));
1598 } else if (ir->operands[1]->is_uint16_constant()) {
1599 if (devinfo->gen < 7)
1600 emit(MUL(result_dst, op[1], op[0]));
1601 else
1602 emit(MUL(result_dst, op[0], op[1]));
1603 } else {
1604 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1605
1606 emit(MUL(acc, op[0], op[1]));
1607 emit(MACH(dst_null_d(), op[0], op[1]));
1608 emit(MOV(result_dst, src_reg(acc)));
1609 }
1610 } else {
1611 emit(MUL(result_dst, op[0], op[1]));
1612 }
1613 break;
1614 case ir_binop_imul_high: {
1615 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1616
1617 emit(MUL(acc, op[0], op[1]));
1618 emit(MACH(result_dst, op[0], op[1]));
1619 break;
1620 }
1621 case ir_binop_div:
1622 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1623 assert(ir->type->is_integer());
1624 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1625 break;
1626
1627 case ir_binop_carry:
1628 unreachable("Should have been lowered by carry_to_arith().");
1629
1630 case ir_binop_borrow:
1631 unreachable("Should have been lowered by borrow_to_arith().");
1632
1633 case ir_binop_mod:
1634 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1635 assert(ir->type->is_integer());
1636 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1637 break;
1638
1639 case ir_binop_less:
1640 case ir_binop_greater:
1641 case ir_binop_lequal:
1642 case ir_binop_gequal:
1643 case ir_binop_equal:
1644 case ir_binop_nequal: {
1645 if (devinfo->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 resolve_bool_comparison(ir->operands[1], &op[1]);
1648 }
1649 emit(CMP(result_dst, op[0], op[1],
1650 brw_conditional_for_comparison(ir->operation)));
1651 break;
1652 }
1653
1654 case ir_binop_all_equal:
1655 if (devinfo->gen <= 5) {
1656 resolve_bool_comparison(ir->operands[0], &op[0]);
1657 resolve_bool_comparison(ir->operands[1], &op[1]);
1658 }
1659
1660 /* "==" operator producing a scalar boolean. */
1661 if (ir->operands[0]->type->is_vector() ||
1662 ir->operands[1]->type->is_vector()) {
1663 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1664 emit(MOV(result_dst, src_reg(0)));
1665 inst = emit(MOV(result_dst, src_reg(~0)));
1666 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1667 } else {
1668 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1669 }
1670 break;
1671 case ir_binop_any_nequal:
1672 if (devinfo->gen <= 5) {
1673 resolve_bool_comparison(ir->operands[0], &op[0]);
1674 resolve_bool_comparison(ir->operands[1], &op[1]);
1675 }
1676
1677 /* "!=" operator producing a scalar boolean. */
1678 if (ir->operands[0]->type->is_vector() ||
1679 ir->operands[1]->type->is_vector()) {
1680 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1681
1682 emit(MOV(result_dst, src_reg(0)));
1683 inst = emit(MOV(result_dst, src_reg(~0)));
1684 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1685 } else {
1686 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1687 }
1688 break;
1689
1690 case ir_unop_any:
1691 if (devinfo->gen <= 5) {
1692 resolve_bool_comparison(ir->operands[0], &op[0]);
1693 }
1694 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1695 emit(MOV(result_dst, src_reg(0)));
1696
1697 inst = emit(MOV(result_dst, src_reg(~0)));
1698 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1699 break;
1700
1701 case ir_binop_logic_xor:
1702 emit(XOR(result_dst, op[0], op[1]));
1703 break;
1704
1705 case ir_binop_logic_or:
1706 emit(OR(result_dst, op[0], op[1]));
1707 break;
1708
1709 case ir_binop_logic_and:
1710 emit(AND(result_dst, op[0], op[1]));
1711 break;
1712
1713 case ir_binop_dot:
1714 assert(ir->operands[0]->type->is_vector());
1715 assert(ir->operands[0]->type == ir->operands[1]->type);
1716 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1717 break;
1718
1719 case ir_unop_sqrt:
1720 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1721 break;
1722 case ir_unop_rsq:
1723 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1724 break;
1725
1726 case ir_unop_bitcast_i2f:
1727 case ir_unop_bitcast_u2f:
1728 this->result = op[0];
1729 this->result.type = BRW_REGISTER_TYPE_F;
1730 break;
1731
1732 case ir_unop_bitcast_f2i:
1733 this->result = op[0];
1734 this->result.type = BRW_REGISTER_TYPE_D;
1735 break;
1736
1737 case ir_unop_bitcast_f2u:
1738 this->result = op[0];
1739 this->result.type = BRW_REGISTER_TYPE_UD;
1740 break;
1741
1742 case ir_unop_i2f:
1743 case ir_unop_i2u:
1744 case ir_unop_u2i:
1745 case ir_unop_u2f:
1746 case ir_unop_f2i:
1747 case ir_unop_f2u:
1748 emit(MOV(result_dst, op[0]));
1749 break;
1750 case ir_unop_b2i:
1751 case ir_unop_b2f:
1752 if (devinfo->gen <= 5) {
1753 resolve_bool_comparison(ir->operands[0], &op[0]);
1754 }
1755 emit(MOV(result_dst, negate(op[0])));
1756 break;
1757 case ir_unop_f2b:
1758 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1759 break;
1760 case ir_unop_i2b:
1761 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1762 break;
1763
1764 case ir_unop_trunc:
1765 emit(RNDZ(result_dst, op[0]));
1766 break;
1767 case ir_unop_ceil: {
1768 src_reg tmp = src_reg(this, ir->type);
1769 op[0].negate = !op[0].negate;
1770 emit(RNDD(dst_reg(tmp), op[0]));
1771 tmp.negate = true;
1772 emit(MOV(result_dst, tmp));
1773 }
1774 break;
1775 case ir_unop_floor:
1776 inst = emit(RNDD(result_dst, op[0]));
1777 break;
1778 case ir_unop_fract:
1779 inst = emit(FRC(result_dst, op[0]));
1780 break;
1781 case ir_unop_round_even:
1782 emit(RNDE(result_dst, op[0]));
1783 break;
1784
1785 case ir_binop_min:
1786 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1787 break;
1788 case ir_binop_max:
1789 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1790 break;
1791
1792 case ir_binop_pow:
1793 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1794 break;
1795
1796 case ir_unop_bit_not:
1797 inst = emit(NOT(result_dst, op[0]));
1798 break;
1799 case ir_binop_bit_and:
1800 inst = emit(AND(result_dst, op[0], op[1]));
1801 break;
1802 case ir_binop_bit_xor:
1803 inst = emit(XOR(result_dst, op[0], op[1]));
1804 break;
1805 case ir_binop_bit_or:
1806 inst = emit(OR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_lshift:
1810 inst = emit(SHL(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_rshift:
1814 if (ir->type->base_type == GLSL_TYPE_INT)
1815 inst = emit(ASR(result_dst, op[0], op[1]));
1816 else
1817 inst = emit(SHR(result_dst, op[0], op[1]));
1818 break;
1819
1820 case ir_binop_bfm:
1821 emit(BFI1(result_dst, op[0], op[1]));
1822 break;
1823
1824 case ir_binop_ubo_load: {
1825 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1826 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1827 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1828 src_reg offset;
1829
1830 /* Now, load the vector from that offset. */
1831 assert(ir->type->is_vector() || ir->type->is_scalar());
1832
1833 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1834 packed_consts.type = result.type;
1835 src_reg surf_index;
1836
1837 if (const_uniform_block) {
1838 /* The block index is a constant, so just emit the binding table entry
1839 * as an immediate.
1840 */
1841 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1842 const_uniform_block->value.u[0]);
1843 } else {
1844 /* The block index is not a constant. Evaluate the index expression
1845 * per-channel and add the base UBO index; we have to select a value
1846 * from any live channel.
1847 */
1848 surf_index = src_reg(this, glsl_type::uint_type);
1849 emit(ADD(dst_reg(surf_index), op[0],
1850 src_reg(prog_data->base.binding_table.ubo_start)));
1851 surf_index = emit_uniformize(surf_index);
1852
1853 /* Assume this may touch any UBO. It would be nice to provide
1854 * a tighter bound, but the array information is already lowered away.
1855 */
1856 brw_mark_surface_used(&prog_data->base,
1857 prog_data->base.binding_table.ubo_start +
1858 shader_prog->NumUniformBlocks - 1);
1859 }
1860
1861 if (const_offset_ir) {
1862 if (devinfo->gen >= 8) {
1863 /* Store the offset in a GRF so we can send-from-GRF. */
1864 offset = src_reg(this, glsl_type::int_type);
1865 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1866 } else {
1867 /* Immediates are fine on older generations since they'll be moved
1868 * to a (potentially fake) MRF at the generator level.
1869 */
1870 offset = src_reg(const_offset / 16);
1871 }
1872 } else {
1873 offset = src_reg(this, glsl_type::uint_type);
1874 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1875 }
1876
1877 emit_pull_constant_load_reg(dst_reg(packed_consts),
1878 surf_index,
1879 offset,
1880 NULL, NULL /* before_block/inst */);
1881
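      /* Pick the requested dwords out of the 16-byte slot that was loaded:
       * offsetting every swizzle component by const_offset % 16 / 4 makes,
       * e.g., a float at byte offset 8 within the slot read the .z channel
       * of packed_consts.
       */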
1882 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1883 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1884 const_offset % 16 / 4,
1885 const_offset % 16 / 4,
1886 const_offset % 16 / 4);
1887
1888 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1889 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1890 emit(CMP(result_dst, packed_consts, src_reg(0u),
1891 BRW_CONDITIONAL_NZ));
1892 } else {
1893 emit(MOV(result_dst, packed_consts));
1894 }
1895 break;
1896 }
1897
1898 case ir_binop_vector_extract:
1899 unreachable("should have been lowered by vec_index_to_cond_assign");
1900
1901 case ir_triop_fma:
1902 op[0] = fix_3src_operand(op[0]);
1903 op[1] = fix_3src_operand(op[1]);
1904 op[2] = fix_3src_operand(op[2]);
1905 /* Note that the instruction's argument order is reversed from GLSL
1906 * and the IR.
1907 */
1908 emit(MAD(result_dst, op[2], op[1], op[0]));
1909 break;
1910
1911 case ir_triop_lrp:
1912 emit_lrp(result_dst, op[0], op[1], op[2]);
1913 break;
1914
1915 case ir_triop_csel:
1916 unreachable("already handled above");
1917 break;
1918
1919 case ir_triop_bfi:
1920 op[0] = fix_3src_operand(op[0]);
1921 op[1] = fix_3src_operand(op[1]);
1922 op[2] = fix_3src_operand(op[2]);
1923 emit(BFI2(result_dst, op[0], op[1], op[2]));
1924 break;
1925
1926 case ir_triop_bitfield_extract:
1927 op[0] = fix_3src_operand(op[0]);
1928 op[1] = fix_3src_operand(op[1]);
1929 op[2] = fix_3src_operand(op[2]);
1930 /* Note that the instruction's argument order is reversed from GLSL
1931 * and the IR.
1932 */
1933 emit(BFE(result_dst, op[2], op[1], op[0]));
1934 break;
1935
1936 case ir_triop_vector_insert:
1937 unreachable("should have been lowered by lower_vector_insert");
1938
1939 case ir_quadop_bitfield_insert:
1940 unreachable("not reached: should be handled by "
1941 "bitfield_insert_to_bfm_bfi\n");
1942
1943 case ir_quadop_vector:
1944 unreachable("not reached: should be handled by lower_quadop_vector");
1945
1946 case ir_unop_pack_half_2x16:
1947 emit_pack_half_2x16(result_dst, op[0]);
1948 break;
1949 case ir_unop_unpack_half_2x16:
1950 emit_unpack_half_2x16(result_dst, op[0]);
1951 break;
1952 case ir_unop_unpack_unorm_4x8:
1953 emit_unpack_unorm_4x8(result_dst, op[0]);
1954 break;
1955 case ir_unop_unpack_snorm_4x8:
1956 emit_unpack_snorm_4x8(result_dst, op[0]);
1957 break;
1958 case ir_unop_pack_unorm_4x8:
1959 emit_pack_unorm_4x8(result_dst, op[0]);
1960 break;
1961 case ir_unop_pack_snorm_4x8:
1962 emit_pack_snorm_4x8(result_dst, op[0]);
1963 break;
1964 case ir_unop_pack_snorm_2x16:
1965 case ir_unop_pack_unorm_2x16:
1966 case ir_unop_unpack_snorm_2x16:
1967 case ir_unop_unpack_unorm_2x16:
1968 unreachable("not reached: should be handled by lower_packing_builtins");
1969 case ir_unop_unpack_half_2x16_split_x:
1970 case ir_unop_unpack_half_2x16_split_y:
1971 case ir_binop_pack_half_2x16_split:
1972 case ir_unop_interpolate_at_centroid:
1973 case ir_binop_interpolate_at_sample:
1974 case ir_binop_interpolate_at_offset:
1975 unreachable("not reached: should not occur in vertex shader");
1976 case ir_binop_ldexp:
1977 unreachable("not reached: should be handled by ldexp_to_arith()");
1978 case ir_unop_d2f:
1979 case ir_unop_f2d:
1980 case ir_unop_d2i:
1981 case ir_unop_i2d:
1982 case ir_unop_d2u:
1983 case ir_unop_u2d:
1984 case ir_unop_d2b:
1985 case ir_unop_pack_double_2x32:
1986 case ir_unop_unpack_double_2x32:
1987 case ir_unop_frexp_sig:
1988 case ir_unop_frexp_exp:
1989 unreachable("fp64 todo");
1990 }
1991 }
1992
1993
1994 void
1995 vec4_visitor::visit(ir_swizzle *ir)
1996 {
1997 /* Note that this is only swizzles in expressions, not those on the left
1998 * hand side of an assignment, which do write masking. See ir_assignment
1999 * for that.
2000 */
2001 const unsigned swz = brw_compose_swizzle(
2002 brw_swizzle_for_size(ir->type->vector_elements),
2003 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2004
2005 ir->val->accept(this);
2006 this->result = swizzle(this->result, swz);
2007 }
2008
2009 void
2010 vec4_visitor::visit(ir_dereference_variable *ir)
2011 {
2012 const struct glsl_type *type = ir->type;
2013 dst_reg *reg = variable_storage(ir->var);
2014
2015 if (!reg) {
2016 fail("Failed to find variable storage for %s\n", ir->var->name);
2017 this->result = src_reg(brw_null_reg());
2018 return;
2019 }
2020
2021 this->result = src_reg(*reg);
2022
2023 /* System values get their swizzle from the dst_reg writemask */
2024 if (ir->var->data.mode == ir_var_system_value)
2025 return;
2026
2027 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2028 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2029 }
2030
2031
2032 int
2033 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2034 {
2035 /* Under normal circumstances array elements are stored consecutively, so
2036 * the stride is equal to the size of the array element.
2037 */
2038 return type_size(ir->type);
2039 }
2040
2041
2042 void
2043 vec4_visitor::visit(ir_dereference_array *ir)
2044 {
2045 ir_constant *constant_index;
2046 src_reg src;
2047 int array_stride = compute_array_stride(ir);
2048
2049 constant_index = ir->array_index->constant_expression_value();
2050
2051 ir->array->accept(this);
2052 src = this->result;
2053
2054 if (constant_index) {
2055 src.reg_offset += constant_index->value.i[0] * array_stride;
2056 } else {
2057       /* Variable index array dereference. It takes the "vec4" at the
2058        * base of the array plus an index register that offsets the Mesa
2059        * register index.
2060 */
2061 ir->array_index->accept(this);
2062
2063 src_reg index_reg;
2064
2065 if (array_stride == 1) {
2066 index_reg = this->result;
2067 } else {
2068 index_reg = src_reg(this, glsl_type::int_type);
2069
2070 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2071 }
2072
2073 if (src.reladdr) {
2074 src_reg temp = src_reg(this, glsl_type::int_type);
2075
2076 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2077
2078 index_reg = temp;
2079 }
2080
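      /* Store the index in ralloc'ed memory so the reladdr pointer stays
       * valid after this visit returns.
       */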
2081 src.reladdr = ralloc(mem_ctx, src_reg);
2082 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2083 }
2084
2085 /* If the type is smaller than a vec4, replicate the last channel out. */
2086 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2087 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2088 else
2089 src.swizzle = BRW_SWIZZLE_NOOP;
2090 src.type = brw_type_for_base_type(ir->type);
2091
2092 this->result = src;
2093 }
2094
2095 void
2096 vec4_visitor::visit(ir_dereference_record *ir)
2097 {
2098 unsigned int i;
2099 const glsl_type *struct_type = ir->record->type;
2100 int offset = 0;
2101
2102 ir->record->accept(this);
2103
2104 for (i = 0; i < struct_type->length; i++) {
2105 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2106 break;
2107 offset += type_size(struct_type->fields.structure[i].type);
2108 }
2109
2110 /* If the type is smaller than a vec4, replicate the last channel out. */
2111 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2112 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2113 else
2114 this->result.swizzle = BRW_SWIZZLE_NOOP;
2115 this->result.type = brw_type_for_base_type(ir->type);
2116
2117 this->result.reg_offset += offset;
2118 }
2119
2120 /**
2121 * We want to be careful in assignment setup to hit the actual storage
2122 * instead of potentially using a temporary like we might with the
2123 * ir_dereference handler.
2124 */
2125 static dst_reg
2126 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2127 {
2128 /* The LHS must be a dereference. If the LHS is a variable indexed array
2129     * access of a vector, it must be separated into a series of conditional moves
2130 * before reaching this point (see ir_vec_index_to_cond_assign).
2131 */
2132 assert(ir->as_dereference());
2133 ir_dereference_array *deref_array = ir->as_dereference_array();
2134 if (deref_array) {
2135 assert(!deref_array->array->type->is_vector());
2136 }
2137
2138 /* Use the rvalue deref handler for the most part. We'll ignore
2139 * swizzles in it and write swizzles using writemask, though.
2140 */
2141 ir->accept(v);
2142 return dst_reg(v->result);
2143 }
2144
2145 void
2146 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2147 const struct glsl_type *type,
2148 enum brw_predicate predicate)
2149 {
2150 if (type->base_type == GLSL_TYPE_STRUCT) {
2151 for (unsigned int i = 0; i < type->length; i++) {
2152 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2153 }
2154 return;
2155 }
2156
2157 if (type->is_array()) {
2158 for (unsigned int i = 0; i < type->length; i++) {
2159 emit_block_move(dst, src, type->fields.array, predicate);
2160 }
2161 return;
2162 }
2163
2164 if (type->is_matrix()) {
2165 const struct glsl_type *vec_type;
2166
2167 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2168 type->vector_elements, 1);
2169
2170 for (int i = 0; i < type->matrix_columns; i++) {
2171 emit_block_move(dst, src, vec_type, predicate);
2172 }
2173 return;
2174 }
2175
2176 assert(type->is_scalar() || type->is_vector());
2177
2178 dst->type = brw_type_for_base_type(type);
2179 src->type = dst->type;
2180
2181 dst->writemask = (1 << type->vector_elements) - 1;
2182
2183 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2184
2185 vec4_instruction *inst = emit(MOV(*dst, *src));
2186 inst->predicate = predicate;
2187
2188 dst->reg_offset++;
2189 src->reg_offset++;
2190 }
2191
2192
2193 /* If the RHS processing resulted in an instruction generating a
2194 * temporary value, and it would be easy to rewrite the instruction to
2195 * generate its result right into the LHS instead, do so. This ends
2196 * up reliably removing instructions where it can be tricky to do so
2197 * later without real UD chain information.
2198 */
2199 bool
2200 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2201 dst_reg dst,
2202 src_reg src,
2203 vec4_instruction *pre_rhs_inst,
2204 vec4_instruction *last_rhs_inst)
2205 {
2206 /* This could be supported, but it would take more smarts. */
2207 if (ir->condition)
2208 return false;
2209
2210 if (pre_rhs_inst == last_rhs_inst)
2211 return false; /* No instructions generated to work with. */
2212
2213 /* Make sure the last instruction generated our source reg. */
2214 if (src.file != GRF ||
2215 src.file != last_rhs_inst->dst.file ||
2216 src.reg != last_rhs_inst->dst.reg ||
2217 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2218 src.reladdr ||
2219 src.abs ||
2220 src.negate ||
2221 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2222 return false;
2223
2224    /* Check that the last instruction fully initialized the channels
2225 * we want to use, in the order we want to use them. We could
2226 * potentially reswizzle the operands of many instructions so that
2227 * we could handle out of order channels, but don't yet.
2228 */
2229
2230 for (unsigned i = 0; i < 4; i++) {
2231 if (dst.writemask & (1 << i)) {
2232 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2233 return false;
2234
2235 if (BRW_GET_SWZ(src.swizzle, i) != i)
2236 return false;
2237 }
2238 }
2239
2240 /* Success! Rewrite the instruction. */
2241 last_rhs_inst->dst.file = dst.file;
2242 last_rhs_inst->dst.reg = dst.reg;
2243 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2244 last_rhs_inst->dst.reladdr = dst.reladdr;
2245 last_rhs_inst->dst.writemask &= dst.writemask;
2246
2247 return true;
2248 }
2249
2250 void
2251 vec4_visitor::visit(ir_assignment *ir)
2252 {
2253 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2254 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2255
2256 if (!ir->lhs->type->is_scalar() &&
2257 !ir->lhs->type->is_vector()) {
2258 ir->rhs->accept(this);
2259 src_reg src = this->result;
2260
2261 if (ir->condition) {
2262 emit_bool_to_cond_code(ir->condition, &predicate);
2263 }
2264
2265 /* emit_block_move doesn't account for swizzles in the source register.
2266 * This should be ok, since the source register is a structure or an
2267 * array, and those can't be swizzled. But double-check to be sure.
2268 */
2269 assert(src.swizzle ==
2270 (ir->rhs->type->is_matrix()
2271 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2272 : BRW_SWIZZLE_NOOP));
2273
2274 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2275 return;
2276 }
2277
2278 /* Now we're down to just a scalar/vector with writemasks. */
2279 int i;
2280
2281 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2282 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2283
2284 ir->rhs->accept(this);
2285
2286 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2287
2288 int swizzles[4];
2289 int src_chan = 0;
2290
2291 assert(ir->lhs->type->is_vector() ||
2292 ir->lhs->type->is_scalar());
2293 dst.writemask = ir->write_mask;
2294
2295 /* Swizzle a small RHS vector into the channels being written.
2296 *
2297     * GLSL IR treats write_mask as dictating how many channels are
2298     * present on the RHS, while in our instructions we need to make
2299     * those channels appear in the slots of the vec4 they're written to.
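     *
     * For example, a two-component RHS written through .xz becomes the
     * swizzle <x, x, y, x>: RHS channel 0 feeds .x, channel 1 feeds .z, and
     * the unwritten slots simply repeat channel 0.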
2300 */
2301 for (int i = 0; i < 4; i++)
2302 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2303
2304 src_reg src = swizzle(this->result,
2305 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2306 swizzles[2], swizzles[3]));
2307
2308 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2309 return;
2310 }
2311
2312 if (ir->condition) {
2313 emit_bool_to_cond_code(ir->condition, &predicate);
2314 }
2315
2316 for (i = 0; i < type_size(ir->lhs->type); i++) {
2317 vec4_instruction *inst = emit(MOV(dst, src));
2318 inst->predicate = predicate;
2319
2320 dst.reg_offset++;
2321 src.reg_offset++;
2322 }
2323 }
2324
2325 void
2326 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2327 {
2328 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2329 foreach_in_list(ir_constant, field_value, &ir->components) {
2330 emit_constant_values(dst, field_value);
2331 }
2332 return;
2333 }
2334
2335 if (ir->type->is_array()) {
2336 for (unsigned int i = 0; i < ir->type->length; i++) {
2337 emit_constant_values(dst, ir->array_elements[i]);
2338 }
2339 return;
2340 }
2341
2342 if (ir->type->is_matrix()) {
2343 for (int i = 0; i < ir->type->matrix_columns; i++) {
2344 float *vec = &ir->value.f[i * ir->type->vector_elements];
2345
2346 for (int j = 0; j < ir->type->vector_elements; j++) {
2347 dst->writemask = 1 << j;
2348 dst->type = BRW_REGISTER_TYPE_F;
2349
2350 emit(MOV(*dst, src_reg(vec[j])));
2351 }
2352 dst->reg_offset++;
2353 }
2354 return;
2355 }
2356
2357 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2358
2359 for (int i = 0; i < ir->type->vector_elements; i++) {
2360 if (!(remaining_writemask & (1 << i)))
2361 continue;
2362
2363 dst->writemask = 1 << i;
2364 dst->type = brw_type_for_base_type(ir->type);
2365
2366 /* Find other components that match the one we're about to
2367 * write. Emits fewer instructions for things like vec4(0.5,
2368 * 1.5, 1.5, 1.5).
2369 */
2370 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2371 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2372 if (ir->value.b[i] == ir->value.b[j])
2373 dst->writemask |= (1 << j);
2374 } else {
2375 /* u, i, and f storage all line up, so no need for a
2376 * switch case for comparing each type.
2377 */
2378 if (ir->value.u[i] == ir->value.u[j])
2379 dst->writemask |= (1 << j);
2380 }
2381 }
2382
2383 switch (ir->type->base_type) {
2384 case GLSL_TYPE_FLOAT:
2385 emit(MOV(*dst, src_reg(ir->value.f[i])));
2386 break;
2387 case GLSL_TYPE_INT:
2388 emit(MOV(*dst, src_reg(ir->value.i[i])));
2389 break;
2390 case GLSL_TYPE_UINT:
2391 emit(MOV(*dst, src_reg(ir->value.u[i])));
2392 break;
2393 case GLSL_TYPE_BOOL:
2394 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2395 break;
2396 default:
2397 unreachable("Non-float/uint/int/bool constant");
2398 }
2399
2400 remaining_writemask &= ~dst->writemask;
2401 }
2402 dst->reg_offset++;
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_constant *ir)
2407 {
2408 dst_reg dst = dst_reg(this, ir->type);
2409 this->result = src_reg(dst);
2410
2411 emit_constant_values(&dst, ir);
2412 }
2413
2414 void
2415 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2416 {
2417 ir_dereference *deref = static_cast<ir_dereference *>(
2418 ir->actual_parameters.get_head());
2419 ir_variable *location = deref->variable_referenced();
2420 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2421 location->data.binding);
2422
2423 /* Calculate the surface offset */
2424 src_reg offset(this, glsl_type::uint_type);
2425 ir_dereference_array *deref_array = deref->as_dereference_array();
2426 if (deref_array) {
2427 deref_array->array_index->accept(this);
2428
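      /* Scale the array index by the size of one counter and add the
       * counter's byte offset within the buffer.
       */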
2429 src_reg tmp(this, glsl_type::uint_type);
2430 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2431 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2432 } else {
2433 offset = location->data.atomic.offset;
2434 }
2435
2436 /* Emit the appropriate machine instruction */
2437 const char *callee = ir->callee->function_name();
2438 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2439
2440 if (!strcmp("__intrinsic_atomic_read", callee)) {
2441 emit_untyped_surface_read(surf_index, dst, offset);
2442
2443 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2444 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2445 src_reg(), src_reg());
2446
2447 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2448 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2449 src_reg(), src_reg());
2450 }
2451
2452 brw_mark_surface_used(stage_prog_data, surf_index);
2453 }
2454
2455 void
2456 vec4_visitor::visit(ir_call *ir)
2457 {
2458 const char *callee = ir->callee->function_name();
2459
2460 if (!strcmp("__intrinsic_atomic_read", callee) ||
2461 !strcmp("__intrinsic_atomic_increment", callee) ||
2462 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2463 visit_atomic_counter_intrinsic(ir);
2464 } else {
2465 unreachable("Unsupported intrinsic.");
2466 }
2467 }
2468
2469 src_reg
2470 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2471 {
2472 vec4_instruction *inst =
2473 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2474 dst_reg(this, glsl_type::uvec4_type));
2475 inst->base_mrf = 2;
2476 inst->src[1] = sampler;
2477
2478 int param_base;
2479
2480 if (devinfo->gen >= 9) {
2481 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2482 vec4_instruction *header_inst = new(mem_ctx)
2483 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2484 dst_reg(MRF, inst->base_mrf));
2485
2486 emit(header_inst);
2487
2488 inst->mlen = 2;
2489 inst->header_size = 1;
2490 param_base = inst->base_mrf + 1;
2491 } else {
2492 inst->mlen = 1;
2493 param_base = inst->base_mrf;
2494 }
2495
2496    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2497 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2498 int zero_mask = 0xf & ~coord_mask;
2499
2500 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2501 coordinate));
2502
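   /* Zero the remaining payload channels, including the always-zero lod. */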
2503 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2504 src_reg(0)));
2505
2506 emit(inst);
2507 return src_reg(inst->dst);
2508 }
2509
2510 static bool
2511 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2512 {
2513 if (devinfo->gen < 8 && !devinfo->is_haswell)
2514 return false;
2515
2516 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2517 }
2518
2519 void
2520 vec4_visitor::visit(ir_texture *ir)
2521 {
2522 uint32_t sampler =
2523 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2524
2525 ir_rvalue *nonconst_sampler_index =
2526 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2527
2528 /* Handle non-constant sampler array indexing */
2529 src_reg sampler_reg;
2530 if (nonconst_sampler_index) {
2531 /* The highest sampler which may be used by this operation is
2532 * the last element of the array. Mark it here, because the generator
2533 * doesn't have enough information to determine the bound.
2534 */
2535 uint32_t array_size = ir->sampler->as_dereference_array()
2536 ->array->type->array_size();
2537
2538 uint32_t max_used = sampler + array_size - 1;
2539 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2540 max_used += prog_data->base.binding_table.gather_texture_start;
2541 } else {
2542 max_used += prog_data->base.binding_table.texture_start;
2543 }
2544
2545 brw_mark_surface_used(&prog_data->base, max_used);
2546
2547 /* Emit code to evaluate the actual indexing expression */
2548 nonconst_sampler_index->accept(this);
2549 src_reg temp(this, glsl_type::uint_type);
2550 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2551 sampler_reg = emit_uniformize(temp);
2552 } else {
2553 /* Single sampler, or constant array index; the indexing expression
2554 * is just an immediate.
2555 */
2556 sampler_reg = src_reg(sampler);
2557 }
2558
2559 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2560 * emitting anything other than setting up the constant result.
2561 */
2562 if (ir->op == ir_tg4) {
2563 ir_constant *chan = ir->lod_info.component->as_constant();
2564 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2565 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2566 dst_reg result(this, ir->type);
2567 this->result = src_reg(result);
2568 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2569 return;
2570 }
2571 }
2572
2573 /* Should be lowered by do_lower_texture_projection */
2574 assert(!ir->projector);
2575
2576 /* Should be lowered */
2577 assert(!ir->offset || !ir->offset->type->is_array());
2578
2579 /* Generate code to compute all the subexpression trees. This has to be
2580 * done before loading any values into MRFs for the sampler message since
2581 * generating these values may involve SEND messages that need the MRFs.
2582 */
2583 src_reg coordinate;
2584 if (ir->coordinate) {
2585 ir->coordinate->accept(this);
2586 coordinate = this->result;
2587 }
2588
2589 src_reg shadow_comparitor;
2590 if (ir->shadow_comparitor) {
2591 ir->shadow_comparitor->accept(this);
2592 shadow_comparitor = this->result;
2593 }
2594
2595 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2596 src_reg offset_value;
2597 if (has_nonconstant_offset) {
2598 ir->offset->accept(this);
2599 offset_value = src_reg(this->result);
2600 }
2601
2602 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2603 src_reg lod, dPdx, dPdy, sample_index, mcs;
2604 switch (ir->op) {
2605 case ir_tex:
2606 lod = src_reg(0.0f);
2607 lod_type = glsl_type::float_type;
2608 break;
2609 case ir_txf:
2610 case ir_txl:
2611 case ir_txs:
2612 ir->lod_info.lod->accept(this);
2613 lod = this->result;
2614 lod_type = ir->lod_info.lod->type;
2615 break;
2616 case ir_query_levels:
2617 lod = src_reg(0);
2618 lod_type = glsl_type::int_type;
2619 break;
2620 case ir_txf_ms:
2621 ir->lod_info.sample_index->accept(this);
2622 sample_index = this->result;
2623 sample_index_type = ir->lod_info.sample_index->type;
2624
2625 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2626 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2627 else
2628 mcs = src_reg(0u);
2629 break;
2630 case ir_txd:
2631 ir->lod_info.grad.dPdx->accept(this);
2632 dPdx = this->result;
2633
2634 ir->lod_info.grad.dPdy->accept(this);
2635 dPdy = this->result;
2636
2637 lod_type = ir->lod_info.grad.dPdx->type;
2638 break;
2639 case ir_txb:
2640 case ir_lod:
2641 case ir_tg4:
2642 break;
2643 }
2644
2645 enum opcode opcode;
2646 switch (ir->op) {
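   /* Plain ir_tex has no derivatives available in the vertex shader, so it
    * is sent as TXL using the explicit zero LOD set up above.
    */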
2647 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2648 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2649 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2650 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2651 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2652 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2653 case ir_tg4: opcode = has_nonconstant_offset
2654 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2655 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2656 case ir_txb:
2657 unreachable("TXB is not valid for vertex shaders.");
2658 case ir_lod:
2659 unreachable("LOD is not valid for vertex shaders.");
2660 default:
2661 unreachable("Unrecognized tex op");
2662 }
2663
2664 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2665 opcode, dst_reg(this, ir->type));
2666
2667 if (ir->offset != NULL && !has_nonconstant_offset) {
2668 inst->offset =
2669 brw_texture_offset(ir->offset->as_constant()->value.i,
2670 ir->offset->type->vector_elements);
2671 }
2672
2673 /* Stuff the channel select bits in the top of the texture offset */
2674 if (ir->op == ir_tg4)
2675 inst->offset |= gather_channel(ir, sampler) << 16;
2676
2677 /* The message header is necessary for:
2678 * - Gen4 (always)
2679 * - Gen9+ for selecting SIMD4x2
2680 * - Texel offsets
2681 * - Gather channel selection
2682 * - Sampler indices too large to fit in a 4-bit value.
2683 */
2684 inst->header_size =
2685 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2686 inst->offset != 0 || ir->op == ir_tg4 ||
2687 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2688 inst->base_mrf = 2;
2689 inst->mlen = inst->header_size + 1; /* always at least one */
2690 inst->dst.writemask = WRITEMASK_XYZW;
2691 inst->shadow_compare = ir->shadow_comparitor != NULL;
2692
2693 inst->src[1] = sampler_reg;
2694
2695 /* MRF for the first parameter */
2696 int param_base = inst->base_mrf + inst->header_size;
2697
2698 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2699 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2700 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2701 } else {
2702 /* Load the coordinate */
2703 /* FINISHME: gl_clamp_mask and saturate */
2704 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2705 int zero_mask = 0xf & ~coord_mask;
2706
2707 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2708 coordinate));
2709
2710 if (zero_mask != 0) {
2711 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2712 src_reg(0)));
2713 }
2714 /* Load the shadow comparitor */
2715 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2716 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2717 WRITEMASK_X),
2718 shadow_comparitor));
2719 inst->mlen++;
2720 }
2721
2722 /* Load the LOD info */
2723 if (ir->op == ir_tex || ir->op == ir_txl) {
2724 int mrf, writemask;
2725 if (devinfo->gen >= 5) {
2726 mrf = param_base + 1;
2727 if (ir->shadow_comparitor) {
2728 writemask = WRITEMASK_Y;
2729 /* mlen already incremented */
2730 } else {
2731 writemask = WRITEMASK_X;
2732 inst->mlen++;
2733 }
2734 } else /* devinfo->gen == 4 */ {
2735 mrf = param_base;
2736 writemask = WRITEMASK_W;
2737 }
2738 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2739 } else if (ir->op == ir_txf) {
2740 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2741 } else if (ir->op == ir_txf_ms) {
2742 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2743 sample_index));
2744 if (devinfo->gen >= 7) {
2745 /* MCS data is in the first channel of `mcs`, but we need to get it into
2746 * the .y channel of the second vec4 of params, so replicate .x across
2747 * the whole vec4 and then mask off everything except .y
2748 */
2749 mcs.swizzle = BRW_SWIZZLE_XXXX;
2750 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2751 mcs));
2752 }
2753 inst->mlen++;
2754 } else if (ir->op == ir_txd) {
2755 const glsl_type *type = lod_type;
2756
2757 if (devinfo->gen >= 5) {
2758 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2759 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2760 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2761 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2762 inst->mlen++;
2763
2764 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2765 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2766 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2767 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2768 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2769 inst->mlen++;
2770
2771 if (ir->shadow_comparitor) {
2772 emit(MOV(dst_reg(MRF, param_base + 2,
2773 ir->shadow_comparitor->type, WRITEMASK_Z),
2774 shadow_comparitor));
2775 }
2776 }
2777 } else /* devinfo->gen == 4 */ {
2778 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2779 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2780 inst->mlen += 2;
2781 }
2782 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2783 if (ir->shadow_comparitor) {
2784 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2785 shadow_comparitor));
2786 }
2787
2788 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2789 offset_value));
2790 inst->mlen++;
2791 }
2792 }
2793
2794 emit(inst);
2795
2796    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2797     * faces * layers, but the spec requires just layers.
2798 */
2799 if (ir->op == ir_txs) {
2800 glsl_type const *type = ir->sampler->type;
2801 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2802 type->sampler_array) {
2803 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2804 writemask(inst->dst, WRITEMASK_Z),
2805 src_reg(inst->dst), src_reg(6));
2806 }
2807 }
2808
2809 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2810 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2811 }
2812
2813 swizzle_result(ir, src_reg(inst->dst), sampler);
2814 }
2815
2816 /**
2817 * Apply workarounds for Gen6 gather with UINT/SINT
2818 */
2819 void
2820 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2821 {
2822 if (!wa)
2823 return;
2824
2825 int width = (wa & WA_8BIT) ? 8 : 16;
2826 dst_reg dst_f = dst;
2827 dst_f.type = BRW_REGISTER_TYPE_F;
2828
2829 /* Convert from UNORM to UINT */
2830 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2831 emit(MOV(dst, src_reg(dst_f)));
2832
2833 if (wa & WA_SIGN) {
2834 /* Reinterpret the UINT value as a signed INT value by
2835 * shifting the sign bit into place, then shifting back
2836 * preserving sign.
2837 */
2838 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2839 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2840 }
2841 }
2842
2843 /**
2844 * Set up the gather channel based on the swizzle, for gather4.
2845 */
2846 uint32_t
2847 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2848 {
2849 ir_constant *chan = ir->lod_info.component->as_constant();
2850 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2851 switch (swiz) {
2852 case SWIZZLE_X: return 0;
2853 case SWIZZLE_Y:
2854 /* gather4 sampler is broken for green channel on RG32F --
2855 * we must ask for blue instead.
2856 */
2857 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2858 return 2;
2859 return 1;
2860 case SWIZZLE_Z: return 2;
2861 case SWIZZLE_W: return 3;
2862 default:
2863 unreachable("Not reached"); /* zero, one swizzles handled already */
2864 }
2865 }
2866
2867 void
2868 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2869 {
2870 int s = key->tex.swizzles[sampler];
2871
2872 this->result = src_reg(this, ir->type);
2873 dst_reg swizzled_result(this->result);
2874
2875 if (ir->op == ir_query_levels) {
2876 /* # levels is in .w */
2877 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2878 emit(MOV(swizzled_result, orig_val));
2879 return;
2880 }
2881
2882 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2883 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2884 emit(MOV(swizzled_result, orig_val));
2885 return;
2886 }
2887
2888
2889 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2890 int swizzle[4] = {0};
2891
2892 for (int i = 0; i < 4; i++) {
2893 switch (GET_SWZ(s, i)) {
2894 case SWIZZLE_ZERO:
2895 zero_mask |= (1 << i);
2896 break;
2897 case SWIZZLE_ONE:
2898 one_mask |= (1 << i);
2899 break;
2900 default:
2901 copy_mask |= (1 << i);
2902 swizzle[i] = GET_SWZ(s, i);
2903 break;
2904 }
2905 }
2906
2907 if (copy_mask) {
2908 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2909 swizzled_result.writemask = copy_mask;
2910 emit(MOV(swizzled_result, orig_val));
2911 }
2912
2913 if (zero_mask) {
2914 swizzled_result.writemask = zero_mask;
2915 emit(MOV(swizzled_result, src_reg(0.0f)));
2916 }
2917
2918 if (one_mask) {
2919 swizzled_result.writemask = one_mask;
2920 emit(MOV(swizzled_result, src_reg(1.0f)));
2921 }
2922 }
2923
2924 void
2925 vec4_visitor::visit(ir_return *)
2926 {
2927 unreachable("not reached");
2928 }
2929
2930 void
2931 vec4_visitor::visit(ir_discard *)
2932 {
2933 unreachable("not reached");
2934 }
2935
2936 void
2937 vec4_visitor::visit(ir_if *ir)
2938 {
2939 /* Don't point the annotation at the if statement, because then it plus
2940 * the then and else blocks get printed.
2941 */
2942 this->base_ir = ir->condition;
2943
2944 if (devinfo->gen == 6) {
2945 emit_if_gen6(ir);
2946 } else {
2947 enum brw_predicate predicate;
2948 emit_bool_to_cond_code(ir->condition, &predicate);
2949 emit(IF(predicate));
2950 }
2951
2952 visit_instructions(&ir->then_instructions);
2953
2954 if (!ir->else_instructions.is_empty()) {
2955 this->base_ir = ir->condition;
2956 emit(BRW_OPCODE_ELSE);
2957
2958 visit_instructions(&ir->else_instructions);
2959 }
2960
2961 this->base_ir = ir->condition;
2962 emit(BRW_OPCODE_ENDIF);
2963 }
2964
2965 void
2966 vec4_visitor::visit(ir_emit_vertex *)
2967 {
2968 unreachable("not reached");
2969 }
2970
2971 void
2972 vec4_visitor::visit(ir_end_primitive *)
2973 {
2974 unreachable("not reached");
2975 }
2976
2977 void
2978 vec4_visitor::visit(ir_barrier *)
2979 {
2980 unreachable("not reached");
2981 }
2982
2983 void
2984 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2985 dst_reg dst, src_reg offset,
2986 src_reg src0, src_reg src1)
2987 {
2988 unsigned mlen = 0;
2989
2990 /* Set the atomic operation offset. */
2991 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2992 mlen++;
2993
2994 /* Set the atomic operation arguments. */
2995 if (src0.file != BAD_FILE) {
2996 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2997 mlen++;
2998 }
2999
3000 if (src1.file != BAD_FILE) {
3001 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3002 mlen++;
3003 }
3004
3005 /* Emit the instruction. Note that this maps to the normal SIMD8
3006 * untyped atomic message on Ivy Bridge, but that's OK because
3007 * unused channels will be masked out.
3008 */
3009 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3010 brw_message_reg(0),
3011 src_reg(surf_index), src_reg(atomic_op));
3012 inst->mlen = mlen;
3013 }
3014
3015 void
3016 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3017 src_reg offset)
3018 {
3019 /* Set the surface read offset. */
3020 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3021
3022 /* Emit the instruction. Note that this maps to the normal SIMD8
3023 * untyped surface read message, but that's OK because unused
3024 * channels will be masked out.
3025 */
3026 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3027 brw_message_reg(0),
3028 src_reg(surf_index), src_reg(1));
3029 inst->mlen = 1;
3030 }
3031
3032 void
3033 vec4_visitor::emit_ndc_computation()
3034 {
3035 /* Get the position */
3036 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3037
3038 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3039 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3040 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3041
3042 current_annotation = "NDC";
3043 dst_reg ndc_w = ndc;
3044 ndc_w.writemask = WRITEMASK_W;
3045 src_reg pos_w = pos;
3046 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3047 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3048
3049 dst_reg ndc_xyz = ndc;
3050 ndc_xyz.writemask = WRITEMASK_XYZ;
3051
3052 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3053 }
3054
3055 void
3056 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3057 {
3058 if (devinfo->gen < 6 &&
3059 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3060 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3061 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3062 dst_reg header1_w = header1;
3063 header1_w.writemask = WRITEMASK_W;
3064
3065 emit(MOV(header1, 0u));
3066
3067 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3068 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3069
3070 current_annotation = "Point size";
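         /* Pack the point size as a fixed-point value into bits 8..18 of the
          * header word: scale by 2^11, then mask to the 11-bit field.
          */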
3071 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3072 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3073 }
3074
3075 if (key->userclip_active) {
3076 current_annotation = "Clipping flags";
3077 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3078 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3079
3080 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3081 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3082 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3083
3084 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3085 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3086 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3087 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3088 }
3089
3090 /* i965 clipping workaround:
3091 * 1) Test for -ve rhw
3092 * 2) If set,
3093 * set ndc = (0,0,0,0)
3094 * set ucp[6] = 1
3095 *
3096 * Later, clipping will detect ucp[6] and ensure the primitive is
3097 * clipped against all fixed planes.
3098 */
3099 if (devinfo->has_negative_rhw_bug) {
3100 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3101 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3102 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3103 vec4_instruction *inst;
3104 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3105 inst->predicate = BRW_PREDICATE_NORMAL;
3106 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3107 inst->predicate = BRW_PREDICATE_NORMAL;
3108 }
3109
3110 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3111 } else if (devinfo->gen < 6) {
3112 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3113 } else {
3114 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3115 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3116 dst_reg reg_w = reg;
3117 reg_w.writemask = WRITEMASK_W;
3118 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3119 }
3120 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3121 dst_reg reg_y = reg;
3122 reg_y.writemask = WRITEMASK_Y;
3123 reg_y.type = BRW_REGISTER_TYPE_D;
3124 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3125 }
3126 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3127 dst_reg reg_z = reg;
3128 reg_z.writemask = WRITEMASK_Z;
3129 reg_z.type = BRW_REGISTER_TYPE_D;
3130 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3131 }
3132 }
3133 }
3134
3135 void
3136 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3137 {
3138 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3139 *
3140 * "If a linked set of shaders forming the vertex stage contains no
3141 * static write to gl_ClipVertex or gl_ClipDistance, but the
3142 * application has requested clipping against user clip planes through
3143 * the API, then the coordinate written to gl_Position is used for
3144 * comparison against the user clip planes."
3145 *
3146 * This function is only called if the shader didn't write to
3147 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3148 * if the user wrote to it; otherwise we use gl_Position.
3149 */
3150 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3151 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3152 clip_vertex = VARYING_SLOT_POS;
3153 }
3154
3155 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3156 ++i) {
3157 reg.writemask = 1 << i;
3158 emit(DP4(reg,
3159 src_reg(output_reg[clip_vertex]),
3160 src_reg(this->userplane[i + offset])));
3161 }
3162 }
3163
3164 vec4_instruction *
3165 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3166 {
3167 assert (varying < VARYING_SLOT_MAX);
3168 reg.type = output_reg[varying].type;
3169 current_annotation = output_reg_annotation[varying];
3170 /* Copy the register, saturating if necessary */
3171 return emit(MOV(reg, src_reg(output_reg[varying])));
3172 }
3173
3174 void
3175 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3176 {
3177 reg.type = BRW_REGISTER_TYPE_F;
3178
3179 switch (varying) {
3180 case VARYING_SLOT_PSIZ:
3181 {
3182 /* PSIZ is always in slot 0, and is coupled with other flags. */
3183 current_annotation = "indices, point width, clip flags";
3184 emit_psiz_and_flags(reg);
3185 break;
3186 }
3187 case BRW_VARYING_SLOT_NDC:
3188 current_annotation = "NDC";
3189 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3190 break;
3191 case VARYING_SLOT_POS:
3192 current_annotation = "gl_Position";
3193 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3194 break;
3195 case VARYING_SLOT_EDGE:
3196 /* This is present when doing unfilled polygons. We're supposed to copy
3197 * the edge flag from the user-provided vertex array
3198        * (glEdgeFlagPointer); otherwise we copy from the current value
3199 * of that attribute (starts as 1.0f). This is then used in clipping to
3200 * determine which edges should be drawn as wireframe.
3201 */
3202 current_annotation = "edge flag";
3203 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3204 glsl_type::float_type, WRITEMASK_XYZW))));
3205 break;
3206 case BRW_VARYING_SLOT_PAD:
3207 /* No need to write to this slot */
3208 break;
3209 case VARYING_SLOT_COL0:
3210 case VARYING_SLOT_COL1:
3211 case VARYING_SLOT_BFC0:
3212 case VARYING_SLOT_BFC1: {
3213 /* These built-in varyings are only supported in compatibility mode,
3214 * and we only support GS in core profile. So, this must be a vertex
3215 * shader.
3216 */
3217 assert(stage == MESA_SHADER_VERTEX);
3218 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3219 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3220 inst->saturate = true;
3221 break;
3222 }
3223
3224 default:
3225 emit_generic_urb_slot(reg, varying);
3226 break;
3227 }
3228 }
3229
3230 static int
3231 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3232 {
3233 if (devinfo->gen >= 6) {
3234 /* URB data written (does not include the message header reg) must
3235 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3236 * section 5.4.3.2.2: URB_INTERLEAVED.
3237 *
3238 * URB entries are allocated on a multiple of 1024 bits, so an
3239 * extra 128 bits written here to make the end align to 256 is
3240 * no problem.
3241 */
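      /* mlen counts the message header register too, so an odd mlen means an
       * even number of data registers; bump even values up by one.
       */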
3242 if ((mlen % 2) != 1)
3243 mlen++;
3244 }
3245
3246 return mlen;
3247 }
3248
3249
3250 /**
3251 * Generates the VUE payload plus the necessary URB write instructions to
3252 * output it.
3253 *
3254 * The VUE layout is documented in Volume 2a.
3255 */
3256 void
3257 vec4_visitor::emit_vertex()
3258 {
3259 /* MRF 0 is reserved for the debugger, so start with message header
3260 * in MRF 1.
3261 */
3262 int base_mrf = 1;
3263 int mrf = base_mrf;
3264 /* In the process of generating our URB write message contents, we
3265 * may need to unspill a register or load from an array. Those
3266 * reads would use MRFs 14-15.
3267 */
3268 int max_usable_mrf = 13;
3269
3270 /* The following assertion verifies that max_usable_mrf causes an
3271 * even-numbered amount of URB write data, which will meet gen6's
3272 * requirements for length alignment.
3273 */
3274 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3275
3276 /* First mrf is the g0-based message header containing URB handles and
3277 * such.
3278 */
3279 emit_urb_write_header(mrf++);
3280
3281 if (devinfo->gen < 6) {
3282 emit_ndc_computation();
3283 }
3284
3285    /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3286 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3287 current_annotation = "user clip distances";
3288
3289 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3290 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3291
3292 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3293 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3294 }
3295
3296 /* We may need to split this up into several URB writes, so do them in a
3297 * loop.
3298 */
3299 int slot = 0;
3300 bool complete = false;
3301 do {
3302 /* URB offset is in URB row increments, and each of our MRFs is half of
3303 * one of those, since we're doing interleaved writes.
3304 */
3305 int offset = slot / 2;
3306
3307 mrf = base_mrf + 1;
3308 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3309 emit_urb_slot(dst_reg(MRF, mrf++),
3310 prog_data->vue_map.slot_to_varying[slot]);
3311
3312 /* If this was max_usable_mrf, we can't fit anything more into this
3313 * URB WRITE.
3314 */
3315 if (mrf > max_usable_mrf) {
3316 slot++;
3317 break;
3318 }
3319 }
3320
3321 complete = slot >= prog_data->vue_map.num_slots;
3322 current_annotation = "URB write";
3323 vec4_instruction *inst = emit_urb_write_opcode(complete);
3324 inst->base_mrf = base_mrf;
3325 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3326 inst->offset += offset;
3327    } while (!complete);
3328 }
3329
3330
3331 src_reg
3332 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3333 src_reg *reladdr, int reg_offset)
3334 {
3335 /* Because we store the values to scratch interleaved like our
3336 * vertex data, we need to scale the vec4 index by 2.
3337 */
3338 int message_header_scale = 2;
3339
3340 /* Pre-gen6, the message header uses byte offsets instead of vec4
3341 * (16-byte) offset units.
3342 */
3343 if (devinfo->gen < 6)
3344 message_header_scale *= 16;
3345
3346 if (reladdr) {
3347 src_reg index = src_reg(this, glsl_type::int_type);
3348
3349 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3350 src_reg(reg_offset)));
3351 emit_before(block, inst, MUL(dst_reg(index), index,
3352 src_reg(message_header_scale)));
3353
3354 return index;
3355 } else {
3356 return src_reg(reg_offset * message_header_scale);
3357 }
3358 }
3359
3360 src_reg
3361 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3362 src_reg *reladdr, int reg_offset)
3363 {
3364 if (reladdr) {
3365 src_reg index = src_reg(this, glsl_type::int_type);
3366
3367 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3368 src_reg(reg_offset)));
3369
3370 /* Pre-gen6, the message header uses byte offsets instead of vec4
3371 * (16-byte) offset units.
3372 */
3373 if (devinfo->gen < 6) {
3374 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3375 }
3376
3377 return index;
3378 } else if (devinfo->gen >= 8) {
3379 /* Store the offset in a GRF so we can send-from-GRF. */
3380 src_reg offset = src_reg(this, glsl_type::int_type);
3381 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3382 return offset;
3383 } else {
3384 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3385 return src_reg(reg_offset * message_header_scale);
3386 }
3387 }
3388
3389 /**
3390 * Emits an instruction before @inst to load the value named by @orig_src
3391 * from scratch space at @base_offset to @temp.
3392 *
3393 * @base_offset is measured in 32-byte units (the size of a register).
3394 */
3395 void
3396 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3397 dst_reg temp, src_reg orig_src,
3398 int base_offset)
3399 {
3400 int reg_offset = base_offset + orig_src.reg_offset;
3401 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3402 reg_offset);
3403
3404 emit_before(block, inst, SCRATCH_READ(temp, index));
3405 }
3406
3407 /**
3408 * Emits an instruction after @inst to store the value to be written
3409 * to @orig_dst to scratch space at @base_offset, from @temp.
3410 *
3411 * @base_offset is measured in 32-byte units (the size of a register).
3412 */
3413 void
3414 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3415 int base_offset)
3416 {
3417 int reg_offset = base_offset + inst->dst.reg_offset;
3418 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3419 reg_offset);
3420
3421 /* Create a temporary register to store *inst's result in.
3422 *
3423 * We have to be careful in MOVing from our temporary result register in
3424 * the scratch write. If we swizzle from channels of the temporary that
3425 * weren't initialized, it will confuse live interval analysis, which will
3426 * make spilling fail to make progress.
3427 */
3428 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3429 inst->dst.type),
3430 brw_swizzle_for_mask(inst->dst.writemask));
3431 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3432 inst->dst.writemask));
3433 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3434 write->predicate = inst->predicate;
3435 write->ir = inst->ir;
3436 write->annotation = inst->annotation;
3437 inst->insert_after(block, write);
3438
3439 inst->dst.file = temp.file;
3440 inst->dst.reg = temp.reg;
3441 inst->dst.reg_offset = temp.reg_offset;
3442 inst->dst.reladdr = NULL;
3443 }
3444
3445 /**
3446 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3447 * adds the scratch read(s) before \p inst. The function also checks for
3448 * recursive reladdr scratch accesses, issuing the corresponding scratch
3449 * loads and rewriting reladdr references accordingly.
3450 *
3451 * \return \p src if it did not require a scratch load, otherwise, the
3452 * register holding the result of the scratch load that the caller should
3453 * use to rewrite src.
3454 */
3455 src_reg
3456 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3457 vec4_instruction *inst, src_reg src)
3458 {
3459 /* Resolve recursive reladdr scratch access by calling ourselves
3460 * with src.reladdr
3461 */
3462 if (src.reladdr)
3463 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3464 *src.reladdr);
3465
3466 /* Now handle scratch access on src */
3467 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3468 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3469 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3470 src.reg = temp.reg;
3471 src.reg_offset = temp.reg_offset;
3472 src.reladdr = NULL;
3473 }
3474
3475 return src;
3476 }
3477
3478 /**
3479 * We can't generally support array access in GRF space, because a
3480 * single instruction's destination can only span 2 contiguous
3481 * registers. So, we send all GRF arrays that get variable index
3482 * access to scratch space.
3483 */
3484 void
3485 vec4_visitor::move_grf_array_access_to_scratch()
3486 {
3487 int scratch_loc[this->alloc.count];
3488 memset(scratch_loc, -1, sizeof(scratch_loc));
3489
3490 /* First, calculate the set of virtual GRFs that need to be punted
3491 * to scratch due to having any array access on them, and where in
3492 * scratch.
3493 */
3494 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3495 if (inst->dst.file == GRF && inst->dst.reladdr) {
3496 if (scratch_loc[inst->dst.reg] == -1) {
3497 scratch_loc[inst->dst.reg] = last_scratch;
3498 last_scratch += this->alloc.sizes[inst->dst.reg];
3499 }
3500
3501 for (src_reg *iter = inst->dst.reladdr;
3502 iter->reladdr;
3503 iter = iter->reladdr) {
3504 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3505 scratch_loc[iter->reg] = last_scratch;
3506 last_scratch += this->alloc.sizes[iter->reg];
3507 }
3508 }
3509 }
3510
3511 for (int i = 0 ; i < 3; i++) {
3512 for (src_reg *iter = &inst->src[i];
3513 iter->reladdr;
3514 iter = iter->reladdr) {
3515 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3516 scratch_loc[iter->reg] = last_scratch;
3517 last_scratch += this->alloc.sizes[iter->reg];
3518 }
3519 }
3520 }
3521 }
3522
3523 /* Now, for anything that will be accessed through scratch, rewrite
3524 * it to load/store. Note that this is a _safe list walk, because
3525 * we may generate a new scratch_write instruction after the one
3526 * we're processing.
3527 */
3528 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3529 /* Set up the annotation tracking for new generated instructions. */
3530 base_ir = inst->ir;
3531 current_annotation = inst->annotation;
3532
3533 /* First handle scratch access on the dst. Notice we have to handle
3534 * the case where the dst's reladdr also points to scratch space.
3535 */
3536 if (inst->dst.reladdr)
3537 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3538 *inst->dst.reladdr);
3539
3540 /* Now that we have handled any (possibly recursive) reladdr scratch
3541        * accesses for dst, we can safely do the scratch write for dst itself.
3542 */
3543 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3544 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3545
3546 /* Now handle scratch access on any src. In this case, since inst->src[i]
3547 * already is a src_reg, we can just call emit_resolve_reladdr with
3548 * inst->src[i] and it will take care of handling scratch loads for
3549 * both src and src.reladdr (recursively).
3550 */
3551 for (int i = 0 ; i < 3; i++) {
3552 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3553 inst->src[i]);
3554 }
3555 }
3556 }
3557
3558 /**
3559 * Emits an instruction before @inst to load the value named by @orig_src
3560 * from the pull constant buffer (surface) at @base_offset to @temp.
3561 */
3562 void
3563 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3564 dst_reg temp, src_reg orig_src,
3565 int base_offset)
3566 {
3567 int reg_offset = base_offset + orig_src.reg_offset;
3568 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3569 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3570 reg_offset);
3571
3572 emit_pull_constant_load_reg(temp,
3573 index,
3574 offset,
3575 block, inst);
3576 }
3577
3578 /**
3579 * Implements array access of uniforms by inserting a
3580 * PULL_CONSTANT_LOAD instruction.
3581 *
3582 * Unlike temporary GRF array access (where we don't support it due to
3583 * the difficulty of doing relative addressing on instruction
3584 * destinations), we could potentially do array access of uniforms
3585 * that were loaded in GRF space as push constants. In real-world
3586 * usage we've seen, though, the arrays being used are always larger
3587 * than we could load as push constants, so just always move all
3588 * uniform array access out to a pull constant buffer.
3589 */
3590 void
3591 vec4_visitor::move_uniform_array_access_to_pull_constants()
3592 {
3593 int pull_constant_loc[this->uniforms];
3594 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3595 bool nested_reladdr;
3596
3597 /* Walk through and find array access of uniforms. Put a copy of that
3598 * uniform in the pull constant buffer.
3599 *
3600 * Note that we don't move constant-indexed accesses to arrays. No
3601 * testing has been done of the performance impact of this choice.
3602 */
3603 do {
3604 nested_reladdr = false;
3605
3606 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3607 for (int i = 0 ; i < 3; i++) {
3608 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3609 continue;
3610
3611 int uniform = inst->src[i].reg;
3612
3613 if (inst->src[i].reladdr->reladdr)
3614 nested_reladdr = true; /* will need another pass */
3615
3616 /* If this array isn't already present in the pull constant buffer,
3617 * add it.
3618 */
3619 if (pull_constant_loc[uniform] == -1) {
3620 const gl_constant_value **values =
3621 &stage_prog_data->param[uniform * 4];
3622
3623 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3624
3625 assert(uniform < uniform_array_size);
3626 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3627 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3628 = values[j];
3629 }
3630 }
3631
3632 /* Set up the annotation tracking for new generated instructions. */
3633 base_ir = inst->ir;
3634 current_annotation = inst->annotation;
3635
3636 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3637
3638 emit_pull_constant_load(block, inst, temp, inst->src[i],
3639 pull_constant_loc[uniform]);
3640
3641 inst->src[i].file = temp.file;
3642 inst->src[i].reg = temp.reg;
3643 inst->src[i].reg_offset = temp.reg_offset;
3644 inst->src[i].reladdr = NULL;
3645 }
3646 }
3647 } while (nested_reladdr);
3648
3649 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3650 * no need to track them as larger-than-vec4 objects. This will be
3651 * relied on in cutting out unused uniform vectors from push
3652 * constants.
3653 */
3654 split_uniform_registers();
3655 }
3656
3657 void
3658 vec4_visitor::resolve_ud_negate(src_reg *reg)
3659 {
3660 if (reg->type != BRW_REGISTER_TYPE_UD ||
3661 !reg->negate)
3662 return;
3663
3664 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3665 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3666 *reg = temp;
3667 }
3668
3669 /**
3670 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3671 *
3672 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3673 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3674 */
3675 void
3676 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3677 {
3678 assert(devinfo->gen <= 5);
3679
3680 if (!rvalue->type->is_boolean())
3681 return;
3682
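   /* AND with 1 keeps only the defined low bit; negating that 0/1 integer
    * value then yields the 0/~0 encoding we use for booleans.
    */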
3683 src_reg and_result = src_reg(this, rvalue->type);
3684 src_reg neg_result = src_reg(this, rvalue->type);
3685 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3686 emit(MOV(dst_reg(neg_result), negate(and_result)));
3687 *reg = neg_result;
3688 }
3689
3690 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3691 void *log_data,
3692 struct gl_program *prog,
3693 const struct brw_vue_prog_key *key,
3694 struct brw_vue_prog_data *prog_data,
3695 struct gl_shader_program *shader_prog,
3696 gl_shader_stage stage,
3697 void *mem_ctx,
3698 bool no_spills,
3699 int shader_time_index)
3700 : backend_shader(compiler, log_data, mem_ctx,
3701 shader_prog, prog, &prog_data->base, stage),
3702 key(key),
3703 prog_data(prog_data),
3704 sanity_param_count(0),
3705 fail_msg(NULL),
3706 first_non_payload_grf(0),
3707 need_all_constants_in_pull_buffer(false),
3708 no_spills(no_spills),
3709 shader_time_index(shader_time_index),
3710 last_scratch(0)
3711 {
3712 this->failed = false;
3713
3714 this->base_ir = NULL;
3715 this->current_annotation = NULL;
3716 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3717
3718 this->variable_ht = hash_table_ctor(0,
3719 hash_table_pointer_hash,
3720 hash_table_pointer_compare);
3721
3722 this->virtual_grf_start = NULL;
3723 this->virtual_grf_end = NULL;
3724 this->live_intervals = NULL;
3725
3726 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3727
3728 this->uniforms = 0;
3729
3730 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3731 * at least one. See setup_uniforms() in brw_vec4.cpp.
3732 */
3733 this->uniform_array_size = 1;
3734 if (prog_data) {
3735 this->uniform_array_size =
3736 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3737 }
3738
3739 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3740 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3741 }
3742
3743 vec4_visitor::~vec4_visitor()
3744 {
3745 hash_table_dtor(this->variable_ht);
3746 }
3747
3748
3749 void
3750 vec4_visitor::fail(const char *format, ...)
3751 {
3752 va_list va;
3753 char *msg;
3754
3755 if (failed)
3756 return;
3757
3758 failed = true;
3759
3760 va_start(va, format);
3761 msg = ralloc_vasprintf(mem_ctx, format, va);
3762 va_end(va);
3763 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3764
3765 this->fail_msg = msg;
3766
3767 if (debug_enabled) {
3768 fprintf(stderr, "%s", msg);
3769 }
3770 }
3771
3772 } /* namespace brw */