i965: Store a key_tex pointer in vec4_visitor.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
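/* Instantiate helper emitters for the ALU opcodes below.  Each macro expands
 * to a method (e.g. vec4_visitor::ADD()) that allocates and returns a new
 * instruction without adding it to the instruction list; callers typically
 * wrap the result in emit().
 */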
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(const src_reg &src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
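/**
 * Resolve abs/negate source modifiers by copying the value through a MOV, so
 * the result can be used by instructions that would interpret the modifiers
 * differently (e.g. logical operations, where a negate modifier means
 * bitwise NOT on newer hardware).
 */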
315 src_reg
316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
317 {
318 if (!src.abs && !src.negate)
319 return src;
320
321 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
322 resolved.type = src.type;
323 emit(MOV(resolved, src));
324
325 return src_reg(resolved);
326 }
327
328 src_reg
329 vec4_visitor::fix_math_operand(const src_reg &src)
330 {
331 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
332 return src;
333
334 /* The gen6 math instruction ignores the source modifiers --
335 * swizzle, abs, negate, and at least some parts of the register
336 * region description.
337 *
338 * Rather than trying to enumerate all these cases, *always* expand the
339 * operand to a temp GRF for gen6.
340 *
341 * For gen7, keep the operand as-is, except if immediate, which gen7 still
342 * can't use.
343 */
344
345 if (devinfo->gen == 7 && src.file != IMM)
346 return src;
347
348 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
349 expanded.type = src.type;
350 emit(MOV(expanded, src));
351 return src_reg(expanded);
352 }
353
354 vec4_instruction *
355 vec4_visitor::emit_math(enum opcode opcode,
356 const dst_reg &dst,
357 const src_reg &src0, const src_reg &src1)
358 {
359 vec4_instruction *math =
360 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
361
362 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
363 /* MATH on Gen6 must be align1, so we can't do writemasks. */
364 math->dst = dst_reg(this, glsl_type::vec4_type);
365 math->dst.type = dst.type;
366 math = emit(MOV(dst, src_reg(math->dst)));
367 } else if (devinfo->gen < 6) {
368 math->base_mrf = 1;
369 math->mlen = src1.file == BAD_FILE ? 1 : 2;
370 }
371
372 return math;
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (devinfo->gen < 7) {
379 unreachable("ir_unop_pack_half_2x16 should be lowered");
380 }
381
382 assert(dst.type == BRW_REGISTER_TYPE_UD);
383 assert(src0.type == BRW_REGISTER_TYPE_F);
384
385 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
386 *
387 * Because this instruction does not have a 16-bit floating-point type,
388 * the destination data type must be Word (W).
389 *
390 * The destination must be DWord-aligned and specify a horizontal stride
391 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
392 * each destination channel and the upper word is not modified.
393 *
394 * The above restriction implies that the f32to16 instruction must use
395 * align1 mode, because only in align1 mode is it possible to specify
396 * horizontal stride. We choose here to defy the hardware docs and emit
397 * align16 instructions.
398 *
399 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
400 * instructions. I was partially successful in that the code passed all
401 * tests. However, the code was dubiously correct and fragile, and the
402 * tests were not harsh enough to probe that frailty. Not trusting the
403 * code, I chose instead to remain in align16 mode in defiance of the hw
404 * docs).
405 *
406 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
407 * simulator, emitting a f32to16 in align16 mode with UD as destination
408 * data type is safe. The behavior differs from that specified in the PRM
409 * in that the upper word of each destination channel is cleared to 0.
410 */
411
412 dst_reg tmp_dst(this, glsl_type::uvec2_type);
413 src_reg tmp_src(tmp_dst);
414
415 #if 0
416 /* Verify the undocumented behavior on which the following instructions
417 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
418 * then the result of the bit-or instruction below will be incorrect.
419 *
420 * You should inspect the disasm output in order to verify that the MOV is
421 * not optimized away.
422 */
423 emit(MOV(tmp_dst, src_reg(0x12345678u)));
424 #endif
425
426 /* Give tmp the form below, where "." means untouched.
427 *
428 * w z y x w z y x
429 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
430 *
431 * That the upper word of each write-channel be 0 is required for the
432 * following bit-shift and bit-or instructions to work. Note that this
433 * relies on the undocumented hardware behavior mentioned above.
434 */
435 tmp_dst.writemask = WRITEMASK_XY;
436 emit(F32TO16(tmp_dst, src0));
437
438 /* Give the write-channels of dst the form:
439 * 0xhhhh0000
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
442 emit(SHL(dst, tmp_src, src_reg(16u)));
443
444 /* Finally, give the write-channels of dst the form of packHalf2x16's
445 * output:
446 * 0xhhhhllll
447 */
448 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
449 emit(OR(dst, src_reg(dst), tmp_src));
450 }
451
452 void
453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
454 {
455 if (devinfo->gen < 7) {
456 unreachable("ir_unop_unpack_half_2x16 should be lowered");
457 }
458
459 assert(dst.type == BRW_REGISTER_TYPE_F);
460 assert(src0.type == BRW_REGISTER_TYPE_UD);
461
462 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
463 *
464 * Because this instruction does not have a 16-bit floating-point type,
465 * the source data type must be Word (W). The destination type must be
466 * F (Float).
467 *
468 * To use W as the source data type, we must adjust horizontal strides,
469 * which is only possible in align1 mode. All my [chadv] attempts at
470 * emitting align1 instructions for unpackHalf2x16 failed to pass the
471 * Piglit tests, so I gave up.
472 *
473 * I've verified that, on gen7 hardware and the simulator, it is safe to
474 * emit f16to32 in align16 mode with UD as source data type.
475 */
476
477 dst_reg tmp_dst(this, glsl_type::uvec2_type);
478 src_reg tmp_src(tmp_dst);
479
480 tmp_dst.writemask = WRITEMASK_X;
481 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
482
483 tmp_dst.writemask = WRITEMASK_Y;
484 emit(SHR(tmp_dst, src0, src_reg(16u)));
485
486 dst.writemask = WRITEMASK_XY;
487 emit(F16TO32(dst, tmp_src));
488 }
489
490 void
491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
492 {
493 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
494 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
495 * is not suitable to generate the shift values, but we can use the packed
496 * vector float and a type-converting MOV.
497 */
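   /* 0x00, 0x60, 0x70 and 0x78 are the vector-float (VF) encodings of 0.0,
    * 8.0, 16.0 and 24.0, so the type-converting MOV into the UD-typed
    * `shift' register below yields the integer shift counts <0, 8, 16, 24>.
    */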
498 dst_reg shift(this, glsl_type::uvec4_type);
499 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
500
501 dst_reg shifted(this, glsl_type::uvec4_type);
502 src0.swizzle = BRW_SWIZZLE_XXXX;
503 emit(SHR(shifted, src0, src_reg(shift)));
504
505 shifted.type = BRW_REGISTER_TYPE_UB;
506 dst_reg f(this, glsl_type::vec4_type);
507 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
508
509 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
510 }
511
512 void
513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
514 {
515 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
516 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
517 * is not suitable to generate the shift values, but we can use the packed
518 * vector float and a type-converting MOV.
519 */
520 dst_reg shift(this, glsl_type::uvec4_type);
521 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
522
523 dst_reg shifted(this, glsl_type::uvec4_type);
524 src0.swizzle = BRW_SWIZZLE_XXXX;
525 emit(SHR(shifted, src0, src_reg(shift)));
526
527 shifted.type = BRW_REGISTER_TYPE_B;
528 dst_reg f(this, glsl_type::vec4_type);
529 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
533
534 dst_reg max(this, glsl_type::vec4_type);
535 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
536 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
537 }
538
539 void
540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542 dst_reg saturated(this, glsl_type::vec4_type);
543 vec4_instruction *inst = emit(MOV(saturated, src0));
544 inst->saturate = true;
545
546 dst_reg scaled(this, glsl_type::vec4_type);
547 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
548
549 dst_reg rounded(this, glsl_type::vec4_type);
550 emit(RNDE(rounded, src_reg(scaled)));
551
552 dst_reg u(this, glsl_type::uvec4_type);
553 emit(MOV(u, src_reg(rounded)));
554
555 src_reg bytes(u);
556 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
557 }
558
559 void
560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
561 {
562 dst_reg max(this, glsl_type::vec4_type);
563 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
564
565 dst_reg min(this, glsl_type::vec4_type);
566 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
567
568 dst_reg scaled(this, glsl_type::vec4_type);
569 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
570
571 dst_reg rounded(this, glsl_type::vec4_type);
572 emit(RNDE(rounded, src_reg(scaled)));
573
574 dst_reg i(this, glsl_type::ivec4_type);
575 emit(MOV(i, src_reg(rounded)));
576
577 src_reg bytes(i);
578 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
579 }
580
581 void
582 vec4_visitor::visit_instructions(const exec_list *list)
583 {
584 foreach_in_list(ir_instruction, ir, list) {
585 base_ir = ir;
586 ir->accept(this);
587 }
588 }
589
590 /**
591 * Returns the minimum number of vec4 elements needed to pack a type.
592 *
593 * For simple types, it will return 1 (a single vec4); for matrices, the
594  * number of columns; for arrays and structs, the sum of the sizes of
595  * their elements; and for samplers and atomics, zero.
596 *
597 * This method is useful to calculate how much register space is needed to
598 * store a particular type.
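 *
 * For example: a float or a vec3 takes 1 vec4; a mat3 takes 3 (one per
 * column); a vec2[8] array takes 8; and a struct { vec3 v; float f; }
 * takes 2.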
599 */
600 extern "C" int
601 type_size_vec4(const struct glsl_type *type)
602 {
603 unsigned int i;
604 int size;
605
606 switch (type->base_type) {
607 case GLSL_TYPE_UINT:
608 case GLSL_TYPE_INT:
609 case GLSL_TYPE_FLOAT:
610 case GLSL_TYPE_BOOL:
611 if (type->is_matrix()) {
612 return type->matrix_columns;
613 } else {
614          /* Regardless of the size of the vector, it gets a vec4. This is bad
615 * packing for things like floats, but otherwise arrays become a
616 * mess. Hopefully a later pass over the code can pack scalars
617 * down if appropriate.
618 */
619 return 1;
620 }
621 case GLSL_TYPE_ARRAY:
622 assert(type->length > 0);
623 return type_size_vec4(type->fields.array) * type->length;
624 case GLSL_TYPE_STRUCT:
625 size = 0;
626 for (i = 0; i < type->length; i++) {
627 size += type_size_vec4(type->fields.structure[i].type);
628 }
629 return size;
630 case GLSL_TYPE_SUBROUTINE:
631 return 1;
632
633 case GLSL_TYPE_SAMPLER:
634 /* Samplers take up no register space, since they're baked in at
635 * link time.
636 */
637 return 0;
638 case GLSL_TYPE_ATOMIC_UINT:
639 return 0;
640 case GLSL_TYPE_IMAGE:
641 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
642 case GLSL_TYPE_VOID:
643 case GLSL_TYPE_DOUBLE:
644 case GLSL_TYPE_ERROR:
645 case GLSL_TYPE_INTERFACE:
646 unreachable("not reached");
647 }
648
649 return 0;
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
653 {
654 init();
655
656 this->file = GRF;
657 this->reg = v->alloc.allocate(type_size_vec4(type));
658
659 if (type->is_array() || type->is_record()) {
660 this->swizzle = BRW_SWIZZLE_NOOP;
661 } else {
662 this->swizzle = brw_swizzle_for_size(type->vector_elements);
663 }
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
669 {
670 assert(size > 0);
671
672 init();
673
674 this->file = GRF;
675 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
676
677 this->swizzle = BRW_SWIZZLE_NOOP;
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
683 {
684 init();
685
686 this->file = GRF;
687 this->reg = v->alloc.allocate(type_size_vec4(type));
688
689 if (type->is_array() || type->is_record()) {
690 this->writemask = WRITEMASK_XYZW;
691 } else {
692 this->writemask = (1 << type->vector_elements) - 1;
693 }
694
695 this->type = brw_type_for_base_type(type);
696 }
697
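/**
 * Store pointers to the gl_constant_values for one vec4's worth of uniform
 * data starting at param_offset, pointing any of the 4 - n unused trailing
 * components at a shared zero constant.
 */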
698 void
699 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
700 const gl_constant_value *values,
701 unsigned n)
702 {
703 static const gl_constant_value zero = { 0 };
704
705 assert(param_offset % 4 == 0);
706
707 for (unsigned i = 0; i < n; ++i)
708 stage_prog_data->param[param_offset + i] = &values[i];
709
710 for (unsigned i = n; i < 4; ++i)
711 stage_prog_data->param[param_offset + i] = &zero;
712
713 uniform_vector_size[param_offset / 4] = n;
714 }
715
716 /* Our support for uniforms is piggy-backed on the struct
717 * gl_fragment_program, because that's where the values actually
718 * get stored, rather than in some global gl_shader_program uniform
719 * store.
720 */
721 void
722 vec4_visitor::setup_uniform_values(ir_variable *ir)
723 {
724 int namelen = strlen(ir->name);
725
726 /* The data for our (non-builtin) uniforms is stored in a series of
727 * gl_uniform_driver_storage structs for each subcomponent that
728 * glGetUniformLocation() could name. We know it's been set up in the same
729 * order we'd walk the type, so walk the list of storage and find anything
730 * with our name, or the prefix of a component that starts with our name.
731 */
732 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
733 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
734
735 if (storage->builtin)
736 continue;
737
738 if (strncmp(ir->name, storage->name, namelen) != 0 ||
739 (storage->name[namelen] != 0 &&
740 storage->name[namelen] != '.' &&
741 storage->name[namelen] != '[')) {
742 continue;
743 }
744
745 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
746 storage->type->matrix_columns);
747 const unsigned vector_size = storage->type->vector_elements;
748
749 for (unsigned s = 0; s < vector_count; s++) {
750 setup_vec4_uniform_value(uniforms * 4,
751 &storage->storage[s * vector_size],
752 vector_size);
753 uniforms++;
754 }
755 }
756 }
757
758 /* Our support for builtin uniforms is even scarier than non-builtin.
759 * It sits on top of the PROG_STATE_VAR parameters that are
760 * automatically updated from GL context state.
761 */
762 void
763 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
764 {
765 const ir_state_slot *const slots = ir->get_state_slots();
766 assert(slots != NULL);
767
768 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
769  * This state reference has already been set up by ir_to_mesa,
770 * but we'll get the same index back here. We can reference
771 * ParameterValues directly, since unlike brw_fs.cpp, we never
772 * add new state references during compile.
773 */
774 int index = _mesa_add_state_reference(this->prog->Parameters,
775 (gl_state_index *)slots[i].tokens);
776 gl_constant_value *values =
777 &this->prog->Parameters->ParameterValues[index][0];
778
779 assert(this->uniforms < uniform_array_size);
780
781 for (unsigned j = 0; j < 4; j++)
782 stage_prog_data->param[this->uniforms * 4 + j] =
783 &values[GET_SWZ(slots[i].swizzle, j)];
784
785 this->uniform_vector_size[this->uniforms] =
786 (ir->type->is_scalar() || ir->type->is_vector() ||
787 ir->type->is_matrix() ? ir->type->vector_elements : 4);
788
789 this->uniforms++;
790 }
791 }
792
793 dst_reg *
794 vec4_visitor::variable_storage(ir_variable *var)
795 {
796 return (dst_reg *)hash_table_find(this->variable_ht, var);
797 }
798
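/**
 * Evaluate a boolean rvalue and leave the result in the flag register, so a
 * following instruction can be predicated on it.
 *
 * *predicate is set to the predicate mode the caller should use: NORMAL for
 * per-channel conditions, or ALIGN16_ALL4H/ALIGN16_ANY4H for all()/any()
 * style reductions.
 */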
799 void
800 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
801 enum brw_predicate *predicate)
802 {
803 ir_expression *expr = ir->as_expression();
804
805 *predicate = BRW_PREDICATE_NORMAL;
806
807 if (expr && expr->operation != ir_binop_ubo_load) {
808 src_reg op[3];
809 vec4_instruction *inst;
810
811 assert(expr->get_num_operands() <= 3);
812 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
813 expr->operands[i]->accept(this);
814 op[i] = this->result;
815
816 resolve_ud_negate(&op[i]);
817 }
818
819 switch (expr->operation) {
820 case ir_unop_logic_not:
821 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
822 inst->conditional_mod = BRW_CONDITIONAL_Z;
823 break;
824
825 case ir_binop_logic_xor:
826 if (devinfo->gen <= 5) {
827 src_reg temp = src_reg(this, ir->type);
828 emit(XOR(dst_reg(temp), op[0], op[1]));
829 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
830 } else {
831 inst = emit(XOR(dst_null_d(), op[0], op[1]));
832 }
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_binop_logic_or:
837 if (devinfo->gen <= 5) {
838 src_reg temp = src_reg(this, ir->type);
839 emit(OR(dst_reg(temp), op[0], op[1]));
840 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
841 } else {
842 inst = emit(OR(dst_null_d(), op[0], op[1]));
843 }
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 break;
846
847 case ir_binop_logic_and:
848 if (devinfo->gen <= 5) {
849 src_reg temp = src_reg(this, ir->type);
850 emit(AND(dst_reg(temp), op[0], op[1]));
851 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
852 } else {
853 inst = emit(AND(dst_null_d(), op[0], op[1]));
854 }
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 break;
857
858 case ir_unop_f2b:
859 if (devinfo->gen >= 6) {
860 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
861 } else {
862 inst = emit(MOV(dst_null_f(), op[0]));
863 inst->conditional_mod = BRW_CONDITIONAL_NZ;
864 }
865 break;
866
867 case ir_unop_i2b:
868 if (devinfo->gen >= 6) {
869 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
870 } else {
871 inst = emit(MOV(dst_null_d(), op[0]));
872 inst->conditional_mod = BRW_CONDITIONAL_NZ;
873 }
874 break;
875
876 case ir_binop_all_equal:
877 if (devinfo->gen <= 5) {
878 resolve_bool_comparison(expr->operands[0], &op[0]);
879 resolve_bool_comparison(expr->operands[1], &op[1]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
882 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
883 break;
884
885 case ir_binop_any_nequal:
886 if (devinfo->gen <= 5) {
887 resolve_bool_comparison(expr->operands[0], &op[0]);
888 resolve_bool_comparison(expr->operands[1], &op[1]);
889 }
890 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
891 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
892 break;
893
894 case ir_unop_any:
895 if (devinfo->gen <= 5) {
896 resolve_bool_comparison(expr->operands[0], &op[0]);
897 }
898 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
899 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
900 break;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 if (devinfo->gen <= 5) {
909 resolve_bool_comparison(expr->operands[0], &op[0]);
910 resolve_bool_comparison(expr->operands[1], &op[1]);
911 }
912 emit(CMP(dst_null_d(), op[0], op[1],
913 brw_conditional_for_comparison(expr->operation)));
914 break;
915
916 case ir_triop_csel: {
917 /* Expand the boolean condition into the flag register. */
918 inst = emit(MOV(dst_null_d(), op[0]));
919 inst->conditional_mod = BRW_CONDITIONAL_NZ;
920
921 /* Select which boolean to return. */
922 dst_reg temp(this, expr->operands[1]->type);
923 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
924 inst->predicate = BRW_PREDICATE_NORMAL;
925
926 /* Expand the result to a condition code. */
927 inst = emit(MOV(dst_null_d(), src_reg(temp)));
928 inst->conditional_mod = BRW_CONDITIONAL_NZ;
929 break;
930 }
931
932 default:
933 unreachable("not reached");
934 }
935 return;
936 }
937
938 ir->accept(this);
939
940 resolve_ud_negate(&this->result);
941
942 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
943 inst->conditional_mod = BRW_CONDITIONAL_NZ;
944 }
945
946 /**
947 * Emit a gen6 IF statement with the comparison folded into the IF
948 * instruction.
949 */
950 void
951 vec4_visitor::emit_if_gen6(ir_if *ir)
952 {
953 ir_expression *expr = ir->condition->as_expression();
954
955 if (expr && expr->operation != ir_binop_ubo_load) {
956 src_reg op[3];
957 dst_reg temp;
958
959 assert(expr->get_num_operands() <= 3);
960 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
961 expr->operands[i]->accept(this);
962 op[i] = this->result;
963 }
964
965 switch (expr->operation) {
966 case ir_unop_logic_not:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
968 return;
969
970 case ir_binop_logic_xor:
971 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_or:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(OR(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_binop_logic_and:
981 temp = dst_reg(this, glsl_type::bool_type);
982 emit(AND(temp, op[0], op[1]));
983 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_f2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_unop_i2b:
991 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
992 return;
993
994 case ir_binop_greater:
995 case ir_binop_gequal:
996 case ir_binop_less:
997 case ir_binop_lequal:
998 case ir_binop_equal:
999 case ir_binop_nequal:
1000 emit(IF(op[0], op[1],
1001 brw_conditional_for_comparison(expr->operation)));
1002 return;
1003
1004 case ir_binop_all_equal:
1005 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1006 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1007 return;
1008
1009 case ir_binop_any_nequal:
1010 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1011 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1012 return;
1013
1014 case ir_unop_any:
1015 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1016 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1017 return;
1018
1019 case ir_triop_csel: {
1020 /* Expand the boolean condition into the flag register. */
1021 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1022 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1023
1024 /* Select which boolean to return. */
1025 dst_reg temp(this, expr->operands[1]->type);
1026 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1027 inst->predicate = BRW_PREDICATE_NORMAL;
1028
1029 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1030 return;
1031 }
1032
1033 default:
1034 unreachable("not reached");
1035 }
1036 return;
1037 }
1038
1039 ir->condition->accept(this);
1040
1041 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_variable *ir)
1046 {
1047 dst_reg *reg = NULL;
1048
1049 if (variable_storage(ir))
1050 return;
1051
1052 switch (ir->data.mode) {
1053 case ir_var_shader_in:
1054 assert(ir->data.location != -1);
1055 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1056 break;
1057
1058 case ir_var_shader_out:
1059 assert(ir->data.location != -1);
1060 reg = new(mem_ctx) dst_reg(this, ir->type);
1061
1062 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1063 output_reg[ir->data.location + i] = *reg;
1064 output_reg[ir->data.location + i].reg_offset = i;
1065 output_reg_annotation[ir->data.location + i] = ir->name;
1066 }
1067 break;
1068
1069 case ir_var_auto:
1070 case ir_var_temporary:
1071 reg = new(mem_ctx) dst_reg(this, ir->type);
1072 break;
1073
1074 case ir_var_uniform:
1075 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1076
1077 /* Thanks to the lower_ubo_reference pass, we will see only
1078 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1079 * variables, so no need for them to be in variable_ht.
1080 *
1081 * Some uniforms, such as samplers and atomic counters, have no actual
1082 * storage, so we should ignore them.
1083 */
1084 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1085 return;
1086
1087 /* Track how big the whole uniform variable is, in case we need to put a
1088 * copy of its data into pull constants for array access.
1089 */
1090 assert(this->uniforms < uniform_array_size);
1091 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1092
1093 if (!strncmp(ir->name, "gl_", 3)) {
1094 setup_builtin_uniform_values(ir);
1095 } else {
1096 setup_uniform_values(ir);
1097 }
1098 break;
1099
1100 case ir_var_system_value:
1101 reg = make_reg_for_system_value(ir->data.location, ir->type);
1102 break;
1103
1104 default:
1105 unreachable("not reached");
1106 }
1107
1108 reg->type = brw_type_for_base_type(ir->type);
1109 hash_table_insert(this->variable_ht, reg, ir);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop *ir)
1114 {
1115 /* We don't want debugging output to print the whole body of the
1116 * loop as the annotation.
1117 */
1118 this->base_ir = NULL;
1119
1120 emit(BRW_OPCODE_DO);
1121
1122 visit_instructions(&ir->body_instructions);
1123
1124 emit(BRW_OPCODE_WHILE);
1125 }
1126
1127 void
1128 vec4_visitor::visit(ir_loop_jump *ir)
1129 {
1130 switch (ir->mode) {
1131 case ir_loop_jump::jump_break:
1132 emit(BRW_OPCODE_BREAK);
1133 break;
1134 case ir_loop_jump::jump_continue:
1135 emit(BRW_OPCODE_CONTINUE);
1136 break;
1137 }
1138 }
1139
1140
1141 void
1142 vec4_visitor::visit(ir_function_signature *)
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_function *ir)
1149 {
1150 /* Ignore function bodies other than main() -- we shouldn't see calls to
1151 * them since they should all be inlined.
1152 */
1153 if (strcmp(ir->name, "main") == 0) {
1154 const ir_function_signature *sig;
1155 exec_list empty;
1156
1157 sig = ir->matching_signature(NULL, &empty, false);
1158
1159 assert(sig);
1160
1161 visit_instructions(&sig->body);
1162 }
1163 }
1164
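/**
 * Try to emit an add-of-multiply expression as a single MAD instruction.
 *
 * Looks for a multiply (possibly wrapped in a negate or abs) among the add's
 * operands and folds the negate/abs into the MAD sources.  Only possible on
 * gen6+ and only for float types.
 */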
1165 bool
1166 vec4_visitor::try_emit_mad(ir_expression *ir)
1167 {
1168 /* 3-src instructions were introduced in gen6. */
1169 if (devinfo->gen < 6)
1170 return false;
1171
1172 /* MAD can only handle floating-point data. */
1173 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1174 return false;
1175
1176 ir_rvalue *nonmul;
1177 ir_expression *mul;
1178 bool mul_negate, mul_abs;
1179
1180 for (int i = 0; i < 2; i++) {
1181 mul_negate = false;
1182 mul_abs = false;
1183
1184 mul = ir->operands[i]->as_expression();
1185 nonmul = ir->operands[1 - i];
1186
1187 if (mul && mul->operation == ir_unop_abs) {
1188 mul = mul->operands[0]->as_expression();
1189 mul_abs = true;
1190 } else if (mul && mul->operation == ir_unop_neg) {
1191 mul = mul->operands[0]->as_expression();
1192 mul_negate = true;
1193 }
1194
1195 if (mul && mul->operation == ir_binop_mul)
1196 break;
1197 }
1198
1199 if (!mul || mul->operation != ir_binop_mul)
1200 return false;
1201
1202 nonmul->accept(this);
1203 src_reg src0 = fix_3src_operand(this->result);
1204
1205 mul->operands[0]->accept(this);
1206 src_reg src1 = fix_3src_operand(this->result);
1207 src1.negate ^= mul_negate;
1208 src1.abs = mul_abs;
1209 if (mul_abs)
1210 src1.negate = false;
1211
1212 mul->operands[1]->accept(this);
1213 src_reg src2 = fix_3src_operand(this->result);
1214 src2.abs = mul_abs;
1215 if (mul_abs)
1216 src2.negate = false;
1217
1218 this->result = src_reg(this, ir->type);
1219 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1220
1221 return true;
1222 }
1223
1224 bool
1225 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1226 {
1227 /* This optimization relies on CMP setting the destination to 0 when
1228 * false. Early hardware only sets the least significant bit, and
1229 * leaves the other bits undefined. So we can't use it.
1230 */
1231 if (devinfo->gen < 6)
1232 return false;
1233
1234 ir_expression *const cmp = ir->operands[0]->as_expression();
1235
1236 if (cmp == NULL)
1237 return false;
1238
1239 switch (cmp->operation) {
1240 case ir_binop_less:
1241 case ir_binop_greater:
1242 case ir_binop_lequal:
1243 case ir_binop_gequal:
1244 case ir_binop_equal:
1245 case ir_binop_nequal:
1246 break;
1247
1248 default:
1249 return false;
1250 }
1251
1252 cmp->operands[0]->accept(this);
1253 const src_reg cmp_src0 = this->result;
1254
1255 cmp->operands[1]->accept(this);
1256 const src_reg cmp_src1 = this->result;
1257
1258 this->result = src_reg(this, ir->type);
1259
1260 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1261 brw_conditional_for_comparison(cmp->operation)));
1262
1263 /* If the comparison is false, this->result will just happen to be zero.
1264 */
1265 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1266 this->result, src_reg(1.0f));
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 inst->predicate_inverse = true;
1269
1270 return true;
1271 }
1272
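/**
 * Emit a MIN or MAX operation, selected by the conditional mod
 * (BRW_CONDITIONAL_L for min, BRW_CONDITIONAL_GE for max).  Gen6+ can use a
 * single SEL with a conditional mod; older hardware needs a CMP to set the
 * flag followed by a predicated SEL.
 */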
1273 vec4_instruction *
1274 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1275 src_reg src0, src_reg src1)
1276 {
1277 vec4_instruction *inst;
1278
1279 if (devinfo->gen >= 6) {
1280 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1281 inst->conditional_mod = conditionalmod;
1282 } else {
1283 emit(CMP(dst, src0, src1, conditionalmod));
1284
1285 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 }
1288
1289 return inst;
1290 }
1291
1292 vec4_instruction *
1293 vec4_visitor::emit_lrp(const dst_reg &dst,
1294 const src_reg &x, const src_reg &y, const src_reg &a)
1295 {
1296 if (devinfo->gen >= 6) {
1297 /* Note that the instruction's argument order is reversed from GLSL
1298 * and the IR.
1299 */
1300 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1301 fix_3src_operand(x)));
1302 } else {
1303 /* Earlier generations don't support three source operations, so we
1304 * need to emit x*(1-a) + y*a.
1305 */
1306 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1307 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1308 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1309 y_times_a.writemask = dst.writemask;
1310 one_minus_a.writemask = dst.writemask;
1311 x_times_one_minus_a.writemask = dst.writemask;
1312
1313 emit(MUL(y_times_a, y, a));
1314 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1315 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1316 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1317 }
1318 }
1319
1320 /**
1321 * Emits the instructions needed to perform a pull constant load. before_block
1322  * and before_inst can be NULL, in which case the instructions will be appended
1323 * to the end of the instruction list.
1324 */
1325 void
1326 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1327 src_reg surf_index,
1328 src_reg offset_reg,
1329 bblock_t *before_block,
1330 vec4_instruction *before_inst)
1331 {
1332 assert((before_inst == NULL && before_block == NULL) ||
1333 (before_inst && before_block));
1334
1335 vec4_instruction *pull;
1336
1337 if (devinfo->gen >= 9) {
1338 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1339 src_reg header(this, glsl_type::uvec4_type, 2);
1340
1341 pull = new(mem_ctx)
1342 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1343 dst_reg(header));
1344
1345 if (before_inst)
1346 emit_before(before_block, before_inst, pull);
1347 else
1348 emit(pull);
1349
1350 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1351 offset_reg.type);
1352 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1353
1354 if (before_inst)
1355 emit_before(before_block, before_inst, pull);
1356 else
1357 emit(pull);
1358
1359 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1360 dst,
1361 surf_index,
1362 header);
1363 pull->mlen = 2;
1364 pull->header_size = 1;
1365 } else if (devinfo->gen >= 7) {
1366 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1367
1368 grf_offset.type = offset_reg.type;
1369
1370 pull = MOV(grf_offset, offset_reg);
1371
1372 if (before_inst)
1373 emit_before(before_block, before_inst, pull);
1374 else
1375 emit(pull);
1376
1377 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1378 dst,
1379 surf_index,
1380 src_reg(grf_offset));
1381 pull->mlen = 1;
1382 } else {
1383 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1384 dst,
1385 surf_index,
1386 offset_reg);
1387 pull->base_mrf = 14;
1388 pull->mlen = 1;
1389 }
1390
1391 if (before_inst)
1392 emit_before(before_block, before_inst, pull);
1393 else
1394 emit(pull);
1395 }
1396
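/**
 * Copy the value of an arbitrary live channel of \p src into every channel
 * of the result, producing a dynamically uniform value (used, for example,
 * for indirect surface indices).
 */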
1397 src_reg
1398 vec4_visitor::emit_uniformize(const src_reg &src)
1399 {
1400 const src_reg chan_index(this, glsl_type::uint_type);
1401 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1402 src.type);
1403
1404 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1405 ->force_writemask_all = true;
1406 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1407 ->force_writemask_all = true;
1408
1409 return src_reg(dst);
1410 }
1411
1412 void
1413 vec4_visitor::visit(ir_expression *ir)
1414 {
1415 unsigned int operand;
1416 src_reg op[ARRAY_SIZE(ir->operands)];
1417 vec4_instruction *inst;
1418
1419 if (ir->operation == ir_binop_add) {
1420 if (try_emit_mad(ir))
1421 return;
1422 }
1423
1424 if (ir->operation == ir_unop_b2f) {
1425 if (try_emit_b2f_of_compare(ir))
1426 return;
1427 }
1428
1429 /* Storage for our result. Ideally for an assignment we'd be using
1430 * the actual storage for the result here, instead.
1431 */
1432 dst_reg result_dst(this, ir->type);
1433 src_reg result_src(result_dst);
1434
1435 if (ir->operation == ir_triop_csel) {
1436 ir->operands[1]->accept(this);
1437 op[1] = this->result;
1438 ir->operands[2]->accept(this);
1439 op[2] = this->result;
1440
1441 enum brw_predicate predicate;
1442 emit_bool_to_cond_code(ir->operands[0], &predicate);
1443 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1444 inst->predicate = predicate;
1445 this->result = result_src;
1446 return;
1447 }
1448
1449 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1450 this->result.file = BAD_FILE;
1451 ir->operands[operand]->accept(this);
1452 if (this->result.file == BAD_FILE) {
1453 fprintf(stderr, "Failed to get tree for expression operand:\n");
1454 ir->operands[operand]->fprint(stderr);
1455 exit(1);
1456 }
1457 op[operand] = this->result;
1458
1459 /* Matrix expression operands should have been broken down to vector
1460 * operations already.
1461 */
1462 assert(!ir->operands[operand]->type->is_matrix());
1463 }
1464
1465 /* If nothing special happens, this is the result. */
1466 this->result = result_src;
1467
1468 switch (ir->operation) {
1469 case ir_unop_logic_not:
1470 emit(NOT(result_dst, op[0]));
1471 break;
1472 case ir_unop_neg:
1473 op[0].negate = !op[0].negate;
1474 emit(MOV(result_dst, op[0]));
1475 break;
1476 case ir_unop_abs:
1477 op[0].abs = true;
1478 op[0].negate = false;
1479 emit(MOV(result_dst, op[0]));
1480 break;
1481
1482 case ir_unop_sign:
1483 if (ir->type->is_float()) {
1484 /* AND(val, 0x80000000) gives the sign bit.
1485 *
1486 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1487 * zero.
1488 */
1489 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1490
1491 op[0].type = BRW_REGISTER_TYPE_UD;
1492 result_dst.type = BRW_REGISTER_TYPE_UD;
1493 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1494
1495 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1496 inst->predicate = BRW_PREDICATE_NORMAL;
1497
1498 this->result.type = BRW_REGISTER_TYPE_F;
1499 } else {
1500 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1501 * -> non-negative val generates 0x00000000.
1502 * Predicated OR sets 1 if val is positive.
1503 */
1504 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1505
1506 emit(ASR(result_dst, op[0], src_reg(31)));
1507
1508 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1509 inst->predicate = BRW_PREDICATE_NORMAL;
1510 }
1511 break;
1512
1513 case ir_unop_rcp:
1514 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1515 break;
1516
1517 case ir_unop_exp2:
1518 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1519 break;
1520 case ir_unop_log2:
1521 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1522 break;
1523 case ir_unop_exp:
1524 case ir_unop_log:
1525 unreachable("not reached: should be handled by ir_explog_to_explog2");
1526 case ir_unop_sin:
1527 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1528 break;
1529 case ir_unop_cos:
1530 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1531 break;
1532
1533 case ir_unop_dFdx:
1534 case ir_unop_dFdx_coarse:
1535 case ir_unop_dFdx_fine:
1536 case ir_unop_dFdy:
1537 case ir_unop_dFdy_coarse:
1538 case ir_unop_dFdy_fine:
1539 unreachable("derivatives not valid in vertex shader");
1540
1541 case ir_unop_bitfield_reverse:
1542 emit(BFREV(result_dst, op[0]));
1543 break;
1544 case ir_unop_bit_count:
1545 emit(CBIT(result_dst, op[0]));
1546 break;
1547 case ir_unop_find_msb: {
1548 src_reg temp = src_reg(this, glsl_type::uint_type);
1549
1550 inst = emit(FBH(dst_reg(temp), op[0]));
1551 inst->dst.writemask = WRITEMASK_XYZW;
1552
1553 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1554 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1555 * subtract the result from 31 to convert the MSB count into an LSB count.
1556 */
1557
1558 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1559 temp.swizzle = BRW_SWIZZLE_NOOP;
1560 emit(MOV(result_dst, temp));
1561
1562 src_reg src_tmp = src_reg(result_dst);
1563 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1564
1565 src_tmp.negate = true;
1566 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1567 inst->predicate = BRW_PREDICATE_NORMAL;
1568 break;
1569 }
1570 case ir_unop_find_lsb:
1571 emit(FBL(result_dst, op[0]));
1572 break;
1573 case ir_unop_saturate:
1574 inst = emit(MOV(result_dst, op[0]));
1575 inst->saturate = true;
1576 break;
1577
1578 case ir_unop_noise:
1579 unreachable("not reached: should be handled by lower_noise");
1580
1581 case ir_unop_subroutine_to_int:
1582 emit(MOV(result_dst, op[0]));
1583 break;
1584
1585 case ir_binop_add:
1586 emit(ADD(result_dst, op[0], op[1]));
1587 break;
1588 case ir_binop_sub:
1589 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1590
1591 case ir_binop_mul:
1592 if (devinfo->gen < 8 && ir->type->is_integer()) {
1593 /* For integer multiplication, the MUL uses the low 16 bits of one of
1594 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1595  * adds in the contribution of the upper 16 bits of that
1596 * operand. If we can determine that one of the args is in the low
1597 * 16 bits, though, we can just emit a single MUL.
1598 */
1599 if (ir->operands[0]->is_uint16_constant()) {
1600 if (devinfo->gen < 7)
1601 emit(MUL(result_dst, op[0], op[1]));
1602 else
1603 emit(MUL(result_dst, op[1], op[0]));
1604 } else if (ir->operands[1]->is_uint16_constant()) {
1605 if (devinfo->gen < 7)
1606 emit(MUL(result_dst, op[1], op[0]));
1607 else
1608 emit(MUL(result_dst, op[0], op[1]));
1609 } else {
1610 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1611
1612 emit(MUL(acc, op[0], op[1]));
1613 emit(MACH(dst_null_d(), op[0], op[1]));
1614 emit(MOV(result_dst, src_reg(acc)));
1615 }
1616 } else {
1617 emit(MUL(result_dst, op[0], op[1]));
1618 }
1619 break;
1620 case ir_binop_imul_high: {
1621 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1622
1623 emit(MUL(acc, op[0], op[1]));
1624 emit(MACH(result_dst, op[0], op[1]));
1625 break;
1626 }
1627 case ir_binop_div:
1628 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1629 assert(ir->type->is_integer());
1630 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1631 break;
1632
1633 case ir_binop_carry:
1634 unreachable("Should have been lowered by carry_to_arith().");
1635
1636 case ir_binop_borrow:
1637 unreachable("Should have been lowered by borrow_to_arith().");
1638
1639 case ir_binop_mod:
1640 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1641 assert(ir->type->is_integer());
1642 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1643 break;
1644
1645 case ir_binop_less:
1646 case ir_binop_greater:
1647 case ir_binop_lequal:
1648 case ir_binop_gequal:
1649 case ir_binop_equal:
1650 case ir_binop_nequal: {
1651 if (devinfo->gen <= 5) {
1652 resolve_bool_comparison(ir->operands[0], &op[0]);
1653 resolve_bool_comparison(ir->operands[1], &op[1]);
1654 }
1655 emit(CMP(result_dst, op[0], op[1],
1656 brw_conditional_for_comparison(ir->operation)));
1657 break;
1658 }
1659
1660 case ir_binop_all_equal:
1661 if (devinfo->gen <= 5) {
1662 resolve_bool_comparison(ir->operands[0], &op[0]);
1663 resolve_bool_comparison(ir->operands[1], &op[1]);
1664 }
1665
1666 /* "==" operator producing a scalar boolean. */
1667 if (ir->operands[0]->type->is_vector() ||
1668 ir->operands[1]->type->is_vector()) {
1669 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1670 emit(MOV(result_dst, src_reg(0)));
1671 inst = emit(MOV(result_dst, src_reg(~0)));
1672 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1673 } else {
1674 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1675 }
1676 break;
1677 case ir_binop_any_nequal:
1678 if (devinfo->gen <= 5) {
1679 resolve_bool_comparison(ir->operands[0], &op[0]);
1680 resolve_bool_comparison(ir->operands[1], &op[1]);
1681 }
1682
1683 /* "!=" operator producing a scalar boolean. */
1684 if (ir->operands[0]->type->is_vector() ||
1685 ir->operands[1]->type->is_vector()) {
1686 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1687
1688 emit(MOV(result_dst, src_reg(0)));
1689 inst = emit(MOV(result_dst, src_reg(~0)));
1690 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1691 } else {
1692 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1693 }
1694 break;
1695
1696 case ir_unop_any:
1697 if (devinfo->gen <= 5) {
1698 resolve_bool_comparison(ir->operands[0], &op[0]);
1699 }
1700 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1701 emit(MOV(result_dst, src_reg(0)));
1702
1703 inst = emit(MOV(result_dst, src_reg(~0)));
1704 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1705 break;
1706
1707 case ir_binop_logic_xor:
1708 emit(XOR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_or:
1712 emit(OR(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_logic_and:
1716 emit(AND(result_dst, op[0], op[1]));
1717 break;
1718
1719 case ir_binop_dot:
1720 assert(ir->operands[0]->type->is_vector());
1721 assert(ir->operands[0]->type == ir->operands[1]->type);
1722 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1723 break;
1724
1725 case ir_unop_sqrt:
1726 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1727 break;
1728 case ir_unop_rsq:
1729 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1730 break;
1731
1732 case ir_unop_bitcast_i2f:
1733 case ir_unop_bitcast_u2f:
1734 this->result = op[0];
1735 this->result.type = BRW_REGISTER_TYPE_F;
1736 break;
1737
1738 case ir_unop_bitcast_f2i:
1739 this->result = op[0];
1740 this->result.type = BRW_REGISTER_TYPE_D;
1741 break;
1742
1743 case ir_unop_bitcast_f2u:
1744 this->result = op[0];
1745 this->result.type = BRW_REGISTER_TYPE_UD;
1746 break;
1747
1748 case ir_unop_i2f:
1749 case ir_unop_i2u:
1750 case ir_unop_u2i:
1751 case ir_unop_u2f:
1752 case ir_unop_f2i:
1753 case ir_unop_f2u:
1754 emit(MOV(result_dst, op[0]));
1755 break;
1756 case ir_unop_b2i:
1757 case ir_unop_b2f:
1758 if (devinfo->gen <= 5) {
1759 resolve_bool_comparison(ir->operands[0], &op[0]);
1760 }
1761 emit(MOV(result_dst, negate(op[0])));
1762 break;
1763 case ir_unop_f2b:
1764 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1765 break;
1766 case ir_unop_i2b:
1767 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1768 break;
1769
1770 case ir_unop_trunc:
1771 emit(RNDZ(result_dst, op[0]));
1772 break;
1773 case ir_unop_ceil: {
1774 src_reg tmp = src_reg(this, ir->type);
1775 op[0].negate = !op[0].negate;
1776 emit(RNDD(dst_reg(tmp), op[0]));
1777 tmp.negate = true;
1778 emit(MOV(result_dst, tmp));
1779 }
1780 break;
1781 case ir_unop_floor:
1782 inst = emit(RNDD(result_dst, op[0]));
1783 break;
1784 case ir_unop_fract:
1785 inst = emit(FRC(result_dst, op[0]));
1786 break;
1787 case ir_unop_round_even:
1788 emit(RNDE(result_dst, op[0]));
1789 break;
1790
1791 case ir_binop_min:
1792 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1793 break;
1794 case ir_binop_max:
1795 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_binop_pow:
1799 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1800 break;
1801
1802 case ir_unop_bit_not:
1803 inst = emit(NOT(result_dst, op[0]));
1804 break;
1805 case ir_binop_bit_and:
1806 inst = emit(AND(result_dst, op[0], op[1]));
1807 break;
1808 case ir_binop_bit_xor:
1809 inst = emit(XOR(result_dst, op[0], op[1]));
1810 break;
1811 case ir_binop_bit_or:
1812 inst = emit(OR(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_lshift:
1816 inst = emit(SHL(result_dst, op[0], op[1]));
1817 break;
1818
1819 case ir_binop_rshift:
1820 if (ir->type->base_type == GLSL_TYPE_INT)
1821 inst = emit(ASR(result_dst, op[0], op[1]));
1822 else
1823 inst = emit(SHR(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_bfm:
1827 emit(BFI1(result_dst, op[0], op[1]));
1828 break;
1829
1830 case ir_binop_ubo_load: {
1831 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1832 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1833 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1834 src_reg offset;
1835
1836 /* Now, load the vector from that offset. */
1837 assert(ir->type->is_vector() || ir->type->is_scalar());
1838
1839 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1840 packed_consts.type = result.type;
1841 src_reg surf_index;
1842
1843 if (const_uniform_block) {
1844 /* The block index is a constant, so just emit the binding table entry
1845 * as an immediate.
1846 */
1847 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1848 const_uniform_block->value.u[0]);
1849 } else {
1850 /* The block index is not a constant. Evaluate the index expression
1851 * per-channel and add the base UBO index; we have to select a value
1852 * from any live channel.
1853 */
1854 surf_index = src_reg(this, glsl_type::uint_type);
1855 emit(ADD(dst_reg(surf_index), op[0],
1856 src_reg(prog_data->base.binding_table.ubo_start)));
1857 surf_index = emit_uniformize(surf_index);
1858
1859 /* Assume this may touch any UBO. It would be nice to provide
1860 * a tighter bound, but the array information is already lowered away.
1861 */
1862 brw_mark_surface_used(&prog_data->base,
1863 prog_data->base.binding_table.ubo_start +
1864 shader_prog->NumUniformBlocks - 1);
1865 }
1866
1867 if (const_offset_ir) {
1868 if (devinfo->gen >= 8) {
1869 /* Store the offset in a GRF so we can send-from-GRF. */
1870 offset = src_reg(this, glsl_type::int_type);
1871 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1872 } else {
1873 /* Immediates are fine on older generations since they'll be moved
1874 * to a (potentially fake) MRF at the generator level.
1875 */
1876 offset = src_reg(const_offset / 16);
1877 }
1878 } else {
1879 offset = src_reg(this, glsl_type::uint_type);
1880 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1881 }
1882
1883 emit_pull_constant_load_reg(dst_reg(packed_consts),
1884 surf_index,
1885 offset,
1886 NULL, NULL /* before_block/inst */);
1887
1888 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1889 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1890 const_offset % 16 / 4,
1891 const_offset % 16 / 4,
1892 const_offset % 16 / 4);
1893
1894 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1895 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1896 emit(CMP(result_dst, packed_consts, src_reg(0u),
1897 BRW_CONDITIONAL_NZ));
1898 } else {
1899 emit(MOV(result_dst, packed_consts));
1900 }
1901 break;
1902 }
1903
1904 case ir_binop_vector_extract:
1905 unreachable("should have been lowered by vec_index_to_cond_assign");
1906
1907 case ir_triop_fma:
1908 op[0] = fix_3src_operand(op[0]);
1909 op[1] = fix_3src_operand(op[1]);
1910 op[2] = fix_3src_operand(op[2]);
1911 /* Note that the instruction's argument order is reversed from GLSL
1912 * and the IR.
1913 */
1914 emit(MAD(result_dst, op[2], op[1], op[0]));
1915 break;
1916
1917 case ir_triop_lrp:
1918 emit_lrp(result_dst, op[0], op[1], op[2]);
1919 break;
1920
1921 case ir_triop_csel:
1922 unreachable("already handled above");
1923 break;
1924
1925 case ir_triop_bfi:
1926 op[0] = fix_3src_operand(op[0]);
1927 op[1] = fix_3src_operand(op[1]);
1928 op[2] = fix_3src_operand(op[2]);
1929 emit(BFI2(result_dst, op[0], op[1], op[2]));
1930 break;
1931
1932 case ir_triop_bitfield_extract:
1933 op[0] = fix_3src_operand(op[0]);
1934 op[1] = fix_3src_operand(op[1]);
1935 op[2] = fix_3src_operand(op[2]);
1936 /* Note that the instruction's argument order is reversed from GLSL
1937 * and the IR.
1938 */
1939 emit(BFE(result_dst, op[2], op[1], op[0]));
1940 break;
1941
1942 case ir_triop_vector_insert:
1943 unreachable("should have been lowered by lower_vector_insert");
1944
1945 case ir_quadop_bitfield_insert:
1946 unreachable("not reached: should be handled by "
1947 "bitfield_insert_to_bfm_bfi\n");
1948
1949 case ir_quadop_vector:
1950 unreachable("not reached: should be handled by lower_quadop_vector");
1951
1952 case ir_unop_pack_half_2x16:
1953 emit_pack_half_2x16(result_dst, op[0]);
1954 break;
1955 case ir_unop_unpack_half_2x16:
1956 emit_unpack_half_2x16(result_dst, op[0]);
1957 break;
1958 case ir_unop_unpack_unorm_4x8:
1959 emit_unpack_unorm_4x8(result_dst, op[0]);
1960 break;
1961 case ir_unop_unpack_snorm_4x8:
1962 emit_unpack_snorm_4x8(result_dst, op[0]);
1963 break;
1964 case ir_unop_pack_unorm_4x8:
1965 emit_pack_unorm_4x8(result_dst, op[0]);
1966 break;
1967 case ir_unop_pack_snorm_4x8:
1968 emit_pack_snorm_4x8(result_dst, op[0]);
1969 break;
1970 case ir_unop_pack_snorm_2x16:
1971 case ir_unop_pack_unorm_2x16:
1972 case ir_unop_unpack_snorm_2x16:
1973 case ir_unop_unpack_unorm_2x16:
1974 unreachable("not reached: should be handled by lower_packing_builtins");
1975 case ir_unop_unpack_half_2x16_split_x:
1976 case ir_unop_unpack_half_2x16_split_y:
1977 case ir_binop_pack_half_2x16_split:
1978 case ir_unop_interpolate_at_centroid:
1979 case ir_binop_interpolate_at_sample:
1980 case ir_binop_interpolate_at_offset:
1981 unreachable("not reached: should not occur in vertex shader");
1982 case ir_binop_ldexp:
1983 unreachable("not reached: should be handled by ldexp_to_arith()");
1984 case ir_unop_d2f:
1985 case ir_unop_f2d:
1986 case ir_unop_d2i:
1987 case ir_unop_i2d:
1988 case ir_unop_d2u:
1989 case ir_unop_u2d:
1990 case ir_unop_d2b:
1991 case ir_unop_pack_double_2x32:
1992 case ir_unop_unpack_double_2x32:
1993 case ir_unop_frexp_sig:
1994 case ir_unop_frexp_exp:
1995 unreachable("fp64 todo");
1996 }
1997 }
1998
1999
2000 void
2001 vec4_visitor::visit(ir_swizzle *ir)
2002 {
2003 /* Note that this is only swizzles in expressions, not those on the left
2004 * hand side of an assignment, which do write masking. See ir_assignment
2005 * for that.
2006 */
2007 const unsigned swz = brw_compose_swizzle(
2008 brw_swizzle_for_size(ir->type->vector_elements),
2009 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2010
2011 ir->val->accept(this);
2012 this->result = swizzle(this->result, swz);
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_dereference_variable *ir)
2017 {
2018 const struct glsl_type *type = ir->type;
2019 dst_reg *reg = variable_storage(ir->var);
2020
2021 if (!reg) {
2022 fail("Failed to find variable storage for %s\n", ir->var->name);
2023 this->result = src_reg(brw_null_reg());
2024 return;
2025 }
2026
2027 this->result = src_reg(*reg);
2028
2029 /* System values get their swizzle from the dst_reg writemask */
2030 if (ir->var->data.mode == ir_var_system_value)
2031 return;
2032
2033 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2034 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2035 }
2036
2037
2038 int
2039 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2040 {
2041 /* Under normal circumstances array elements are stored consecutively, so
2042 * the stride is equal to the size of the array element.
2043 */
2044 return type_size_vec4(ir->type);
2045 }
2046
2047
2048 void
2049 vec4_visitor::visit(ir_dereference_array *ir)
2050 {
2051 ir_constant *constant_index;
2052 src_reg src;
2053 int array_stride = compute_array_stride(ir);
2054
2055 constant_index = ir->array_index->constant_expression_value();
2056
2057 ir->array->accept(this);
2058 src = this->result;
2059
2060 if (constant_index) {
2061 src.reg_offset += constant_index->value.i[0] * array_stride;
2062 } else {
2063 /* Variable index array dereference. The result is the "vec4" at the
2064 * base of the array plus a computed index that offsets the register
2065 * index.
2066 */
2067 ir->array_index->accept(this);
2068
2069 src_reg index_reg;
2070
2071 if (array_stride == 1) {
2072 index_reg = this->result;
2073 } else {
2074 index_reg = src_reg(this, glsl_type::int_type);
2075
2076 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2077 }
2078
2079 if (src.reladdr) {
2080 src_reg temp = src_reg(this, glsl_type::int_type);
2081
2082 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2083
2084 index_reg = temp;
2085 }
2086
2087 src.reladdr = ralloc(mem_ctx, src_reg);
2088 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2089 }
2090
2091 /* If the type is smaller than a vec4, replicate the last channel out. */
2092 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2093 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2094 else
2095 src.swizzle = BRW_SWIZZLE_NOOP;
2096 src.type = brw_type_for_base_type(ir->type);
2097
2098 this->result = src;
2099 }
2100
2101 void
2102 vec4_visitor::visit(ir_dereference_record *ir)
2103 {
2104 unsigned int i;
2105 const glsl_type *struct_type = ir->record->type;
2106 int offset = 0;
2107
2108 ir->record->accept(this);
2109
2110 for (i = 0; i < struct_type->length; i++) {
2111 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2112 break;
2113 offset += type_size_vec4(struct_type->fields.structure[i].type);
2114 }
2115
2116 /* If the type is smaller than a vec4, replicate the last channel out. */
2117 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2118 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2119 else
2120 this->result.swizzle = BRW_SWIZZLE_NOOP;
2121 this->result.type = brw_type_for_base_type(ir->type);
2122
2123 this->result.reg_offset += offset;
2124 }
2125
2126 /**
2127 * We want to be careful in assignment setup to hit the actual storage
2128 * instead of potentially using a temporary like we might with the
2129 * ir_dereference handler.
2130 */
2131 static dst_reg
2132 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2133 {
2134 /* The LHS must be a dereference. If the LHS is a variable-indexed array
2135 * access of a vector, it must be separated into a series of conditional moves
2136 * before reaching this point (see ir_vec_index_to_cond_assign).
2137 */
2138 assert(ir->as_dereference());
2139 ir_dereference_array *deref_array = ir->as_dereference_array();
2140 if (deref_array) {
2141 assert(!deref_array->array->type->is_vector());
2142 }
2143
2144 /* Use the rvalue deref handler for the most part. We'll ignore its
2145 * swizzles, though, and express write swizzles through the writemask.
2146 */
2147 ir->accept(v);
2148 return dst_reg(v->result);
2149 }
2150
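/**
 * Emit a (possibly predicated) block copy of an aggregate value, recursing
 * through structs, arrays and matrices down to scalar/vector MOVs and
 * advancing the destination and source register offsets as it goes.
 */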
2151 void
2152 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2153 const struct glsl_type *type,
2154 enum brw_predicate predicate)
2155 {
2156 if (type->base_type == GLSL_TYPE_STRUCT) {
2157 for (unsigned int i = 0; i < type->length; i++) {
2158 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2159 }
2160 return;
2161 }
2162
2163 if (type->is_array()) {
2164 for (unsigned int i = 0; i < type->length; i++) {
2165 emit_block_move(dst, src, type->fields.array, predicate);
2166 }
2167 return;
2168 }
2169
2170 if (type->is_matrix()) {
2171 const struct glsl_type *vec_type;
2172
2173 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2174 type->vector_elements, 1);
2175
2176 for (int i = 0; i < type->matrix_columns; i++) {
2177 emit_block_move(dst, src, vec_type, predicate);
2178 }
2179 return;
2180 }
2181
2182 assert(type->is_scalar() || type->is_vector());
2183
2184 dst->type = brw_type_for_base_type(type);
2185 src->type = dst->type;
2186
2187 dst->writemask = (1 << type->vector_elements) - 1;
2188
2189 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2190
2191 vec4_instruction *inst = emit(MOV(*dst, *src));
2192 inst->predicate = predicate;
2193
2194 dst->reg_offset++;
2195 src->reg_offset++;
2196 }
2197
2198
2199 /* If the RHS processing resulted in an instruction generating a
2200 * temporary value, and it would be easy to rewrite the instruction to
2201 * generate its result right into the LHS instead, do so. This ends
2202 * up reliably removing instructions where it can be tricky to do so
2203 * later without real UD chain information.
2204 */
2205 bool
2206 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2207 dst_reg dst,
2208 src_reg src,
2209 vec4_instruction *pre_rhs_inst,
2210 vec4_instruction *last_rhs_inst)
2211 {
2212 /* This could be supported, but it would take more smarts. */
2213 if (ir->condition)
2214 return false;
2215
2216 if (pre_rhs_inst == last_rhs_inst)
2217 return false; /* No instructions generated to work with. */
2218
2219 /* Make sure the last instruction generated our source reg. */
2220 if (src.file != GRF ||
2221 src.file != last_rhs_inst->dst.file ||
2222 src.reg != last_rhs_inst->dst.reg ||
2223 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2224 src.reladdr ||
2225 src.abs ||
2226 src.negate ||
2227 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2228 return false;
2229
2230 /* Check that the last instruction fully initialized the channels
2231 * we want to use, in the order we want to use them. We could
2232 * potentially reswizzle the operands of many instructions so that
2233 * we could handle out-of-order channels, but we don't yet.
2234 */
2235
2236 for (unsigned i = 0; i < 4; i++) {
2237 if (dst.writemask & (1 << i)) {
2238 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2239 return false;
2240
2241 if (BRW_GET_SWZ(src.swizzle, i) != i)
2242 return false;
2243 }
2244 }
2245
2246 /* Success! Rewrite the instruction. */
2247 last_rhs_inst->dst.file = dst.file;
2248 last_rhs_inst->dst.reg = dst.reg;
2249 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2250 last_rhs_inst->dst.reladdr = dst.reladdr;
2251 last_rhs_inst->dst.writemask &= dst.writemask;
2252
2253 return true;
2254 }
2255
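/**
 * Visit an assignment. Aggregates (structs, arrays, matrices) go through
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * written channels, try to fold the result directly into the instruction
 * that produced it, and otherwise emit (possibly predicated) MOVs.
 */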
2256 void
2257 vec4_visitor::visit(ir_assignment *ir)
2258 {
2259 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2260 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2261
2262 if (!ir->lhs->type->is_scalar() &&
2263 !ir->lhs->type->is_vector()) {
2264 ir->rhs->accept(this);
2265 src_reg src = this->result;
2266
2267 if (ir->condition) {
2268 emit_bool_to_cond_code(ir->condition, &predicate);
2269 }
2270
2271 /* emit_block_move doesn't account for swizzles in the source register.
2272 * This should be ok, since the source register is a structure or an
2273 * array, and those can't be swizzled. But double-check to be sure.
2274 */
2275 assert(src.swizzle ==
2276 (ir->rhs->type->is_matrix()
2277 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2278 : BRW_SWIZZLE_NOOP));
2279
2280 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2281 return;
2282 }
2283
2284 /* Now we're down to just a scalar/vector with writemasks. */
2285 int i;
2286
2287 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2288 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 ir->rhs->accept(this);
2291
2292 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2293
2294 int swizzles[4];
2295 int src_chan = 0;
2296
2297 assert(ir->lhs->type->is_vector() ||
2298 ir->lhs->type->is_scalar());
2299 dst.writemask = ir->write_mask;
2300
2301 /* Swizzle a small RHS vector into the channels being written.
2302 *
2303 * GLSL IR treats write_mask as dictating how many channels are
2304 * present on the RHS, while in our instructions we need to make
2305 * those channels appear in the slots of the vec4 they're written to.
2306 */
2307 for (int i = 0; i < 4; i++)
2308 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2309
2310 src_reg src = swizzle(this->result,
2311 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2312 swizzles[2], swizzles[3]));
2313
2314 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2315 return;
2316 }
2317
2318 if (ir->condition) {
2319 emit_bool_to_cond_code(ir->condition, &predicate);
2320 }
2321
2322 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2323 vec4_instruction *inst = emit(MOV(dst, src));
2324 inst->predicate = predicate;
2325
2326 dst.reg_offset++;
2327 src.reg_offset++;
2328 }
2329 }
2330
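/**
 * Emit MOVs of immediate values for a constant, recursing through structs,
 * arrays and matrices, and coalescing identical components of a vector into
 * a single writemasked MOV.
 */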
2331 void
2332 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2333 {
2334 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2335 foreach_in_list(ir_constant, field_value, &ir->components) {
2336 emit_constant_values(dst, field_value);
2337 }
2338 return;
2339 }
2340
2341 if (ir->type->is_array()) {
2342 for (unsigned int i = 0; i < ir->type->length; i++) {
2343 emit_constant_values(dst, ir->array_elements[i]);
2344 }
2345 return;
2346 }
2347
2348 if (ir->type->is_matrix()) {
2349 for (int i = 0; i < ir->type->matrix_columns; i++) {
2350 float *vec = &ir->value.f[i * ir->type->vector_elements];
2351
2352 for (int j = 0; j < ir->type->vector_elements; j++) {
2353 dst->writemask = 1 << j;
2354 dst->type = BRW_REGISTER_TYPE_F;
2355
2356 emit(MOV(*dst, src_reg(vec[j])));
2357 }
2358 dst->reg_offset++;
2359 }
2360 return;
2361 }
2362
2363 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2364
2365 for (int i = 0; i < ir->type->vector_elements; i++) {
2366 if (!(remaining_writemask & (1 << i)))
2367 continue;
2368
2369 dst->writemask = 1 << i;
2370 dst->type = brw_type_for_base_type(ir->type);
2371
2372 /* Find other components that match the one we're about to
2373 * write. Emits fewer instructions for things like vec4(0.5,
2374 * 1.5, 1.5, 1.5).
2375 */
2376 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2377 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2378 if (ir->value.b[i] == ir->value.b[j])
2379 dst->writemask |= (1 << j);
2380 } else {
2381 /* u, i, and f storage all line up, so no need for a
2382 * switch case for comparing each type.
2383 */
2384 if (ir->value.u[i] == ir->value.u[j])
2385 dst->writemask |= (1 << j);
2386 }
2387 }
2388
2389 switch (ir->type->base_type) {
2390 case GLSL_TYPE_FLOAT:
2391 emit(MOV(*dst, src_reg(ir->value.f[i])));
2392 break;
2393 case GLSL_TYPE_INT:
2394 emit(MOV(*dst, src_reg(ir->value.i[i])));
2395 break;
2396 case GLSL_TYPE_UINT:
2397 emit(MOV(*dst, src_reg(ir->value.u[i])));
2398 break;
2399 case GLSL_TYPE_BOOL:
2400 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2401 break;
2402 default:
2403 unreachable("Non-float/uint/int/bool constant");
2404 }
2405
2406 remaining_writemask &= ~dst->writemask;
2407 }
2408 dst->reg_offset++;
2409 }
2410
2411 void
2412 vec4_visitor::visit(ir_constant *ir)
2413 {
2414 dst_reg dst = dst_reg(this, ir->type);
2415 this->result = src_reg(dst);
2416
2417 emit_constant_values(&dst, ir);
2418 }
2419
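/**
 * Handle the atomic counter intrinsics (read, increment, predecrement) by
 * computing the surface offset of the counter and emitting the matching
 * untyped surface read or untyped atomic message.
 */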
2420 void
2421 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2422 {
2423 ir_dereference *deref = static_cast<ir_dereference *>(
2424 ir->actual_parameters.get_head());
2425 ir_variable *location = deref->variable_referenced();
2426 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2427 location->data.binding);
2428
2429 /* Calculate the surface offset */
2430 src_reg offset(this, glsl_type::uint_type);
2431 ir_dereference_array *deref_array = deref->as_dereference_array();
2432 if (deref_array) {
2433 deref_array->array_index->accept(this);
2434
2435 src_reg tmp(this, glsl_type::uint_type);
2436 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2437 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2438 } else {
2439 offset = location->data.atomic.offset;
2440 }
2441
2442 /* Emit the appropriate machine instruction */
2443 const char *callee = ir->callee->function_name();
2444 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2445
2446 if (!strcmp("__intrinsic_atomic_read", callee)) {
2447 emit_untyped_surface_read(surf_index, dst, offset);
2448
2449 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2450 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452
2453 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2454 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2455 src_reg(), src_reg());
2456 }
2457
2458 brw_mark_surface_used(stage_prog_data, surf_index);
2459 }
2460
2461 void
2462 vec4_visitor::visit(ir_call *ir)
2463 {
2464 const char *callee = ir->callee->function_name();
2465
2466 if (!strcmp("__intrinsic_atomic_read", callee) ||
2467 !strcmp("__intrinsic_atomic_increment", callee) ||
2468 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2469 visit_atomic_counter_intrinsic(ir);
2470 } else {
2471 unreachable("Unsupported intrinsic.");
2472 }
2473 }
2474
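/**
 * Fetch the MCS (multisample control surface) data for the given coordinate,
 * for use by a following TXF_CMS sample from a compressed multisample
 * surface.
 */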
2475 src_reg
2476 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2477 src_reg coordinate, src_reg sampler)
2478 {
2479 vec4_instruction *inst =
2480 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2481 dst_reg(this, glsl_type::uvec4_type));
2482 inst->base_mrf = 2;
2483 inst->src[1] = sampler;
2484
2485 int param_base;
2486
2487 if (devinfo->gen >= 9) {
2488 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2489 vec4_instruction *header_inst = new(mem_ctx)
2490 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2491 dst_reg(MRF, inst->base_mrf));
2492
2493 emit(header_inst);
2494
2495 inst->mlen = 2;
2496 inst->header_size = 1;
2497 param_base = inst->base_mrf + 1;
2498 } else {
2499 inst->mlen = 1;
2500 param_base = inst->base_mrf;
2501 }
2502
2503 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2504 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2505 int zero_mask = 0xf & ~coord_mask;
2506
2507 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2508 coordinate));
2509
2510 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2511 src_reg(0)));
2512
2513 emit(inst);
2514 return src_reg(inst->dst);
2515 }
2516
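/**
 * Returns true if the sampler index can't be expressed directly in the
 * message descriptor: on Haswell and later, indices that aren't compile-time
 * constants or that are 16 or greater must go through the message header.
 */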
2517 bool
2518 vec4_visitor::is_high_sampler(src_reg sampler)
2519 {
2520 if (devinfo->gen < 8 && !devinfo->is_haswell)
2521 return false;
2522
2523 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2524 }
2525
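/**
 * Emit the sampler message for a texture operation: pick the hardware
 * opcode, load the payload (coordinate, shadow comparitor, LOD or gradients,
 * sample index/MCS, offsets) into MRFs, and apply any needed fixups such as
 * the cube array TXS correction and the Gen6 gather workarounds before
 * swizzling the result.
 */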
2526 void
2527 vec4_visitor::emit_texture(ir_texture_opcode op,
2528 dst_reg dest,
2529 const glsl_type *dest_type,
2530 src_reg coordinate,
2531 int coord_components,
2532 src_reg shadow_comparitor,
2533 src_reg lod, src_reg lod2,
2534 src_reg sample_index,
2535 uint32_t constant_offset,
2536 src_reg offset_value,
2537 src_reg mcs,
2538 bool is_cube_array,
2539 uint32_t sampler,
2540 src_reg sampler_reg)
2541 {
2542 enum opcode opcode;
2543 switch (op) {
2544 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2545 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2546 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2547 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2548 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2549 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2550 case ir_tg4: opcode = offset_value.file != BAD_FILE
2551 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2552 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2553 case ir_txb:
2554 unreachable("TXB is not valid for vertex shaders.");
2555 case ir_lod:
2556 unreachable("LOD is not valid for vertex shaders.");
2557 default:
2558 unreachable("Unrecognized tex op");
2559 }
2560
2561 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2562 opcode, dst_reg(this, dest_type));
2563
2564 inst->offset = constant_offset;
2565
2566 /* The message header is necessary for:
2567 * - Gen4 (always)
2568 * - Gen9+ for selecting SIMD4x2
2569 * - Texel offsets
2570 * - Gather channel selection
2571 * - Sampler indices too large to fit in a 4-bit value.
2572 */
2573 inst->header_size =
2574 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2575 inst->offset != 0 || op == ir_tg4 ||
2576 is_high_sampler(sampler_reg)) ? 1 : 0;
2577 inst->base_mrf = 2;
2578 inst->mlen = inst->header_size + 1; /* always at least one */
2579 inst->dst.writemask = WRITEMASK_XYZW;
2580 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2581
2582 inst->src[1] = sampler_reg;
2583
2584 /* MRF for the first parameter */
2585 int param_base = inst->base_mrf + inst->header_size;
2586
2587 if (op == ir_txs || op == ir_query_levels) {
2588 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2589 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2590 } else {
2591 /* Load the coordinate */
2592 /* FINISHME: gl_clamp_mask and saturate */
2593 int coord_mask = (1 << coord_components) - 1;
2594 int zero_mask = 0xf & ~coord_mask;
2595
2596 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2597 coordinate));
2598
2599 if (zero_mask != 0) {
2600 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2601 src_reg(0)));
2602 }
2603 /* Load the shadow comparitor */
2604 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2605 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2606 WRITEMASK_X),
2607 shadow_comparitor));
2608 inst->mlen++;
2609 }
2610
2611 /* Load the LOD info */
2612 if (op == ir_tex || op == ir_txl) {
2613 int mrf, writemask;
2614 if (devinfo->gen >= 5) {
2615 mrf = param_base + 1;
2616 if (shadow_comparitor.file != BAD_FILE) {
2617 writemask = WRITEMASK_Y;
2618 /* mlen already incremented */
2619 } else {
2620 writemask = WRITEMASK_X;
2621 inst->mlen++;
2622 }
2623 } else /* devinfo->gen == 4 */ {
2624 mrf = param_base;
2625 writemask = WRITEMASK_W;
2626 }
2627 lod.swizzle = BRW_SWIZZLE_XXXX;
2628 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2629 } else if (op == ir_txf) {
2630 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2631 } else if (op == ir_txf_ms) {
2632 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2633 sample_index));
2634 if (devinfo->gen >= 7) {
2635 /* MCS data is in the first channel of `mcs`, but we need to get it into
2636 * the .y channel of the second vec4 of params, so replicate .x across
2637 * the whole vec4 and then mask off everything except .y
2638 */
2639 mcs.swizzle = BRW_SWIZZLE_XXXX;
2640 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2641 mcs));
2642 }
2643 inst->mlen++;
2644 } else if (op == ir_txd) {
2645 const brw_reg_type type = lod.type;
2646
2647 if (devinfo->gen >= 5) {
2648 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2649 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2650 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2651 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2652 inst->mlen++;
2653
2654 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2655 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2656 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2657 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2658 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2659 inst->mlen++;
2660
2661 if (shadow_comparitor.file != BAD_FILE) {
2662 emit(MOV(dst_reg(MRF, param_base + 2,
2663 shadow_comparitor.type, WRITEMASK_Z),
2664 shadow_comparitor));
2665 }
2666 }
2667 } else /* devinfo->gen == 4 */ {
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2670 inst->mlen += 2;
2671 }
2672 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2673 if (shadow_comparitor.file != BAD_FILE) {
2674 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2675 shadow_comparitor));
2676 }
2677
2678 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2679 offset_value));
2680 inst->mlen++;
2681 }
2682 }
2683
2684 emit(inst);
2685
2686 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2687 * faces * layers, but the spec requires just layers.
2688 */
2689 if (op == ir_txs && is_cube_array) {
2690 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2691 writemask(inst->dst, WRITEMASK_Z),
2692 src_reg(inst->dst), src_reg(6));
2693 }
2694
2695 if (devinfo->gen == 6 && op == ir_tg4) {
2696 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2697 }
2698
2699 swizzle_result(op, dest,
2700 src_reg(inst->dst), sampler, dest_type);
2701 }
2702
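/**
 * Visit a texture IR node: resolve the sampler index (including
 * non-constant sampler array indexing), evaluate the operand
 * subexpressions, and hand everything off to emit_texture().
 */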
2703 void
2704 vec4_visitor::visit(ir_texture *ir)
2705 {
2706 uint32_t sampler =
2707 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2708
2709 ir_rvalue *nonconst_sampler_index =
2710 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2711
2712 /* Handle non-constant sampler array indexing */
2713 src_reg sampler_reg;
2714 if (nonconst_sampler_index) {
2715 /* The highest sampler which may be used by this operation is
2716 * the last element of the array. Mark it here, because the generator
2717 * doesn't have enough information to determine the bound.
2718 */
2719 uint32_t array_size = ir->sampler->as_dereference_array()
2720 ->array->type->array_size();
2721
2722 uint32_t max_used = sampler + array_size - 1;
2723 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2724 max_used += prog_data->base.binding_table.gather_texture_start;
2725 } else {
2726 max_used += prog_data->base.binding_table.texture_start;
2727 }
2728
2729 brw_mark_surface_used(&prog_data->base, max_used);
2730
2731 /* Emit code to evaluate the actual indexing expression */
2732 nonconst_sampler_index->accept(this);
2733 src_reg temp(this, glsl_type::uint_type);
2734 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2735 sampler_reg = emit_uniformize(temp);
2736 } else {
2737 /* Single sampler, or constant array index; the indexing expression
2738 * is just an immediate.
2739 */
2740 sampler_reg = src_reg(sampler);
2741 }
2742
2743 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2744 * emitting anything other than setting up the constant result.
2745 */
2746 if (ir->op == ir_tg4) {
2747 ir_constant *chan = ir->lod_info.component->as_constant();
2748 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2749 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2750 dst_reg result(this, ir->type);
2751 this->result = src_reg(result);
2752 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2753 return;
2754 }
2755 }
2756
2757 /* Should be lowered by do_lower_texture_projection */
2758 assert(!ir->projector);
2759
2760 /* Should be lowered */
2761 assert(!ir->offset || !ir->offset->type->is_array());
2762
2763 /* Generate code to compute all the subexpression trees. This has to be
2764 * done before loading any values into MRFs for the sampler message since
2765 * generating these values may involve SEND messages that need the MRFs.
2766 */
2767 src_reg coordinate;
2768 int coord_components = 0;
2769 if (ir->coordinate) {
2770 coord_components = ir->coordinate->type->vector_elements;
2771 ir->coordinate->accept(this);
2772 coordinate = this->result;
2773 }
2774
2775 src_reg shadow_comparitor;
2776 if (ir->shadow_comparitor) {
2777 ir->shadow_comparitor->accept(this);
2778 shadow_comparitor = this->result;
2779 }
2780
2781 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2782 src_reg offset_value;
2783 if (has_nonconstant_offset) {
2784 ir->offset->accept(this);
2785 offset_value = src_reg(this->result);
2786 }
2787
2788 src_reg lod, lod2, sample_index, mcs;
2789 switch (ir->op) {
2790 case ir_tex:
2791 lod = src_reg(0.0f);
2792 break;
2793 case ir_txf:
2794 case ir_txl:
2795 case ir_txs:
2796 ir->lod_info.lod->accept(this);
2797 lod = this->result;
2798 break;
2799 case ir_query_levels:
2800 lod = src_reg(0);
2801 break;
2802 case ir_txf_ms:
2803 ir->lod_info.sample_index->accept(this);
2804 sample_index = this->result;
2805
2806 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2807 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2808 else
2809 mcs = src_reg(0u);
2810 break;
2811 case ir_txd:
2812 ir->lod_info.grad.dPdx->accept(this);
2813 lod = this->result;
2814
2815 ir->lod_info.grad.dPdy->accept(this);
2816 lod2 = this->result;
2817 break;
2818 case ir_txb:
2819 case ir_lod:
2820 case ir_tg4:
2821 break;
2822 }
2823
2824 uint32_t constant_offset = 0;
2825 if (ir->offset != NULL && !has_nonconstant_offset) {
2826 constant_offset =
2827 brw_texture_offset(ir->offset->as_constant()->value.i,
2828 ir->offset->type->vector_elements);
2829 }
2830
2831 /* Stuff the channel select bits in the top of the texture offset */
2832 if (ir->op == ir_tg4)
2833 constant_offset |=
2834 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2835 sampler) << 16;
2836
2837 glsl_type const *type = ir->sampler->type;
2838 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2839 type->sampler_array;
2840
2841 this->result = src_reg(this, ir->type);
2842 dst_reg dest = dst_reg(this->result);
2843
2844 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2845 shadow_comparitor,
2846 lod, lod2, sample_index,
2847 constant_offset, offset_value,
2848 mcs, is_cube_array, sampler, sampler_reg);
2849 }
2850
2851 /**
2852 * Apply workarounds for Gen6 gather with UINT/SINT
2853 */
2854 void
2855 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2856 {
2857 if (!wa)
2858 return;
2859
2860 int width = (wa & WA_8BIT) ? 8 : 16;
2861 dst_reg dst_f = dst;
2862 dst_f.type = BRW_REGISTER_TYPE_F;
2863
2864 /* Convert from UNORM to UINT */
2865 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2866 emit(MOV(dst, src_reg(dst_f)));
2867
2868 if (wa & WA_SIGN) {
2869 /* Reinterpret the UINT value as a signed INT value by
2870 * shifting the sign bit into place, then shifting back
2871 * preserving sign.
2872 */
2873 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2874 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2875 }
2876 }
2877
2878 /**
2879 * Set up the gather channel based on the swizzle, for gather4.
2880 */
2881 uint32_t
2882 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2883 {
2884 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2885 switch (swiz) {
2886 case SWIZZLE_X: return 0;
2887 case SWIZZLE_Y:
2888 /* gather4 sampler is broken for green channel on RG32F --
2889 * we must ask for blue instead.
2890 */
2891 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2892 return 2;
2893 return 1;
2894 case SWIZZLE_Z: return 2;
2895 case SWIZZLE_W: return 3;
2896 default:
2897 unreachable("Not reached"); /* zero, one swizzles handled already */
2898 }
2899 }
2900
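/**
 * Apply the texture swizzle from the sampler key to a sample result,
 * emitting writemasked MOVs for the copied, zero and one components.
 */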
2901 void
2902 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2903 src_reg orig_val, uint32_t sampler,
2904 const glsl_type *dest_type)
2905 {
2906 int s = key_tex->swizzles[sampler];
2907
2908 dst_reg swizzled_result = dest;
2909
2910 if (op == ir_query_levels) {
2911 /* # levels is in .w */
2912 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2913 emit(MOV(swizzled_result, orig_val));
2914 return;
2915 }
2916
2917 if (op == ir_txs || dest_type == glsl_type::float_type
2918 || s == SWIZZLE_NOOP || op == ir_tg4) {
2919 emit(MOV(swizzled_result, orig_val));
2920 return;
2921 }
2922
2923
2924 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2925 int swizzle[4] = {0};
2926
2927 for (int i = 0; i < 4; i++) {
2928 switch (GET_SWZ(s, i)) {
2929 case SWIZZLE_ZERO:
2930 zero_mask |= (1 << i);
2931 break;
2932 case SWIZZLE_ONE:
2933 one_mask |= (1 << i);
2934 break;
2935 default:
2936 copy_mask |= (1 << i);
2937 swizzle[i] = GET_SWZ(s, i);
2938 break;
2939 }
2940 }
2941
2942 if (copy_mask) {
2943 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2944 swizzled_result.writemask = copy_mask;
2945 emit(MOV(swizzled_result, orig_val));
2946 }
2947
2948 if (zero_mask) {
2949 swizzled_result.writemask = zero_mask;
2950 emit(MOV(swizzled_result, src_reg(0.0f)));
2951 }
2952
2953 if (one_mask) {
2954 swizzled_result.writemask = one_mask;
2955 emit(MOV(swizzled_result, src_reg(1.0f)));
2956 }
2957 }
2958
2959 void
2960 vec4_visitor::visit(ir_return *)
2961 {
2962 unreachable("not reached");
2963 }
2964
2965 void
2966 vec4_visitor::visit(ir_discard *)
2967 {
2968 unreachable("not reached");
2969 }
2970
2971 void
2972 vec4_visitor::visit(ir_if *ir)
2973 {
2974 /* Don't point the annotation at the if statement, because then it plus
2975 * the then and else blocks get printed.
2976 */
2977 this->base_ir = ir->condition;
2978
2979 if (devinfo->gen == 6) {
2980 emit_if_gen6(ir);
2981 } else {
2982 enum brw_predicate predicate;
2983 emit_bool_to_cond_code(ir->condition, &predicate);
2984 emit(IF(predicate));
2985 }
2986
2987 visit_instructions(&ir->then_instructions);
2988
2989 if (!ir->else_instructions.is_empty()) {
2990 this->base_ir = ir->condition;
2991 emit(BRW_OPCODE_ELSE);
2992
2993 visit_instructions(&ir->else_instructions);
2994 }
2995
2996 this->base_ir = ir->condition;
2997 emit(BRW_OPCODE_ENDIF);
2998 }
2999
3000 void
3001 vec4_visitor::gs_emit_vertex(int stream_id)
3002 {
3003 unreachable("not reached");
3004 }
3005
3006 void
3007 vec4_visitor::visit(ir_emit_vertex *)
3008 {
3009 unreachable("not reached");
3010 }
3011
3012 void
3013 vec4_visitor::gs_end_primitive()
3014 {
3015 unreachable("not reached");
3016 }
3017
3018
3019 void
3020 vec4_visitor::visit(ir_end_primitive *)
3021 {
3022 unreachable("not reached");
3023 }
3024
3025 void
3026 vec4_visitor::visit(ir_barrier *)
3027 {
3028 unreachable("not reached");
3029 }
3030
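/**
 * Emit an untyped atomic message. The offset and any operands are loaded
 * into the .x channel of consecutive MRFs before sending.
 */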
3031 void
3032 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3033 dst_reg dst, src_reg offset,
3034 src_reg src0, src_reg src1)
3035 {
3036 unsigned mlen = 0;
3037
3038 /* Set the atomic operation offset. */
3039 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3040 mlen++;
3041
3042 /* Set the atomic operation arguments. */
3043 if (src0.file != BAD_FILE) {
3044 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3045 mlen++;
3046 }
3047
3048 if (src1.file != BAD_FILE) {
3049 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3050 mlen++;
3051 }
3052
3053 /* Emit the instruction. Note that this maps to the normal SIMD8
3054 * untyped atomic message on Ivy Bridge, but that's OK because
3055 * unused channels will be masked out.
3056 */
3057 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3058 brw_message_reg(0),
3059 src_reg(surf_index), src_reg(atomic_op));
3060 inst->mlen = mlen;
3061 }
3062
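/**
 * Emit an untyped surface read from the given surface at the given offset.
 */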
3063 void
3064 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3065 src_reg offset)
3066 {
3067 /* Set the surface read offset. */
3068 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3069
3070 /* Emit the instruction. Note that this maps to the normal SIMD8
3071 * untyped surface read message, but that's OK because unused
3072 * channels will be masked out.
3073 */
3074 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3075 brw_message_reg(0),
3076 src_reg(surf_index), src_reg(1));
3077 inst->mlen = 1;
3078 }
3079
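/**
 * Compute the NDC (normalized device coordinates) output, (x/w, y/w, z/w,
 * 1/w), which is needed when emitting the vertex on pre-Gen6 hardware.
 */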
3080 void
3081 vec4_visitor::emit_ndc_computation()
3082 {
3083 /* Get the position */
3084 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3085
3086 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3087 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3088 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3089
3090 current_annotation = "NDC";
3091 dst_reg ndc_w = ndc;
3092 ndc_w.writemask = WRITEMASK_W;
3093 src_reg pos_w = pos;
3094 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3095 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3096
3097 dst_reg ndc_xyz = ndc;
3098 ndc_xyz.writemask = WRITEMASK_XYZ;
3099
3100 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3101 }
3102
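/**
 * Write the PSIZ VUE slot. On pre-Gen6 this packs point size, clip-distance
 * flags and the negative-rhw workaround bits into the header word; on Gen6+
 * it writes point size, layer and viewport index into the slot directly.
 */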
3103 void
3104 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3105 {
3106 if (devinfo->gen < 6 &&
3107 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3108 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3109 devinfo->has_negative_rhw_bug)) {
3110 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3111 dst_reg header1_w = header1;
3112 header1_w.writemask = WRITEMASK_W;
3113
3114 emit(MOV(header1, 0u));
3115
3116 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3117 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3118
3119 current_annotation = "Point size";
3120 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3121 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3122 }
3123
3124 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3125 current_annotation = "Clipping flags";
3126 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3127 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3128
3129 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3130 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3131 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3132
3133 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3134 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3135 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3136 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3137 }
3138
3139 /* i965 clipping workaround:
3140 * 1) Test for -ve rhw
3141 * 2) If set,
3142 * set ndc = (0,0,0,0)
3143 * set ucp[6] = 1
3144 *
3145 * Later, clipping will detect ucp[6] and ensure the primitive is
3146 * clipped against all fixed planes.
3147 */
3148 if (devinfo->has_negative_rhw_bug) {
3149 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3150 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3151 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3152 vec4_instruction *inst;
3153 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3154 inst->predicate = BRW_PREDICATE_NORMAL;
3155 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3156 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3157 inst->predicate = BRW_PREDICATE_NORMAL;
3158 }
3159
3160 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3161 } else if (devinfo->gen < 6) {
3162 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3163 } else {
3164 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3165 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3166 dst_reg reg_w = reg;
3167 reg_w.writemask = WRITEMASK_W;
3168 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3169 reg_as_src.type = reg_w.type;
3170 reg_as_src.swizzle = brw_swizzle_for_size(1);
3171 emit(MOV(reg_w, reg_as_src));
3172 }
3173 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3174 dst_reg reg_y = reg;
3175 reg_y.writemask = WRITEMASK_Y;
3176 reg_y.type = BRW_REGISTER_TYPE_D;
3177 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3178 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3179 }
3180 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3181 dst_reg reg_z = reg;
3182 reg_z.writemask = WRITEMASK_Z;
3183 reg_z.type = BRW_REGISTER_TYPE_D;
3184 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3185 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3186 }
3187 }
3188 }
3189
3190 vec4_instruction *
3191 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3192 {
3193 assert(varying < VARYING_SLOT_MAX);
3194 assert(output_reg[varying].type == reg.type);
3195 current_annotation = output_reg_annotation[varying];
3196 /* Copy the register, saturating if necessary */
3197 return emit(MOV(reg, src_reg(output_reg[varying])));
3198 }
3199
3200 void
3201 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3202 {
3203 reg.type = BRW_REGISTER_TYPE_F;
3204 output_reg[varying].type = reg.type;
3205
3206 switch (varying) {
3207 case VARYING_SLOT_PSIZ:
3208 {
3209 /* PSIZ is always in slot 0, and is coupled with other flags. */
3210 current_annotation = "indices, point width, clip flags";
3211 emit_psiz_and_flags(reg);
3212 break;
3213 }
3214 case BRW_VARYING_SLOT_NDC:
3215 current_annotation = "NDC";
3216 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3217 break;
3218 case VARYING_SLOT_POS:
3219 current_annotation = "gl_Position";
3220 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3221 break;
3222 case VARYING_SLOT_EDGE:
3223 /* This is present when doing unfilled polygons. We're supposed to copy
3224 * the edge flag from the user-provided vertex array
3225 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3226 * of that attribute (starts as 1.0f). This is then used in clipping to
3227 * determine which edges should be drawn as wireframe.
3228 */
3229 current_annotation = "edge flag";
3230 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3231 glsl_type::float_type, WRITEMASK_XYZW))));
3232 break;
3233 case BRW_VARYING_SLOT_PAD:
3234 /* No need to write to this slot */
3235 break;
3236 case VARYING_SLOT_COL0:
3237 case VARYING_SLOT_COL1:
3238 case VARYING_SLOT_BFC0:
3239 case VARYING_SLOT_BFC1: {
3240 /* These built-in varyings are only supported in compatibility mode,
3241 * and we only support GS in core profile. So, this must be a vertex
3242 * shader.
3243 */
3244 assert(stage == MESA_SHADER_VERTEX);
3245 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3246 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3247 inst->saturate = true;
3248 break;
3249 }
3250
3251 default:
3252 emit_generic_urb_slot(reg, varying);
3253 break;
3254 }
3255 }
3256
3257 static int
3258 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3259 {
3260 if (devinfo->gen >= 6) {
3261 /* URB data written (does not include the message header reg) must
3262 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3263 * section 5.4.3.2.2: URB_INTERLEAVED.
3264 *
3265 * URB entries are allocated on a multiple of 1024 bits, so an
3266 * extra 128 bits written here to make the end align to 256 is
3267 * no problem.
3268 */
3269 if ((mlen % 2) != 1)
3270 mlen++;
3271 }
3272
3273 return mlen;
3274 }
3275
3276
3277 /**
3278 * Generates the VUE payload plus the necessary URB write instructions to
3279 * output it.
3280 *
3281 * The VUE layout is documented in Volume 2a.
3282 */
3283 void
3284 vec4_visitor::emit_vertex()
3285 {
3286 /* MRF 0 is reserved for the debugger, so start with message header
3287 * in MRF 1.
3288 */
3289 int base_mrf = 1;
3290 int mrf = base_mrf;
3291 /* In the process of generating our URB write message contents, we
3292 * may need to unspill a register or load from an array. Those
3293 * reads would use MRFs 14-15.
3294 */
3295 int max_usable_mrf = 13;
3296
3297 /* The following assertion verifies that max_usable_mrf causes an
3298 * even number of URB write data registers, which will meet gen6's
3299 * requirements for length alignment.
3300 */
3301 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3302
3303 /* First mrf is the g0-based message header containing URB handles and
3304 * such.
3305 */
3306 emit_urb_write_header(mrf++);
3307
3308 if (devinfo->gen < 6) {
3309 emit_ndc_computation();
3310 }
3311
3312 /* We may need to split this up into several URB writes, so do them in a
3313 * loop.
3314 */
3315 int slot = 0;
3316 bool complete = false;
3317 do {
3318 /* URB offset is in URB row increments, and each of our MRFs is half of
3319 * one of those, since we're doing interleaved writes.
3320 */
3321 int offset = slot / 2;
3322
3323 mrf = base_mrf + 1;
3324 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3325 emit_urb_slot(dst_reg(MRF, mrf++),
3326 prog_data->vue_map.slot_to_varying[slot]);
3327
3328 /* If this was max_usable_mrf, we can't fit anything more into this
3329 * URB WRITE.
3330 */
3331 if (mrf > max_usable_mrf) {
3332 slot++;
3333 break;
3334 }
3335 }
3336
3337 complete = slot >= prog_data->vue_map.num_slots;
3338 current_annotation = "URB write";
3339 vec4_instruction *inst = emit_urb_write_opcode(complete);
3340 inst->base_mrf = base_mrf;
3341 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3342 inst->offset += offset;
3343 } while(!complete);
3344 }
3345
3346
3347 src_reg
3348 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3349 src_reg *reladdr, int reg_offset)
3350 {
3351 /* Because we store the values to scratch interleaved like our
3352 * vertex data, we need to scale the vec4 index by 2.
3353 */
3354 int message_header_scale = 2;
3355
3356 /* Pre-gen6, the message header uses byte offsets instead of vec4
3357 * (16-byte) offset units.
3358 */
3359 if (devinfo->gen < 6)
3360 message_header_scale *= 16;
3361
3362 if (reladdr) {
3363 src_reg index = src_reg(this, glsl_type::int_type);
3364
3365 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3366 src_reg(reg_offset)));
3367 emit_before(block, inst, MUL(dst_reg(index), index,
3368 src_reg(message_header_scale)));
3369
3370 return index;
3371 } else {
3372 return src_reg(reg_offset * message_header_scale);
3373 }
3374 }
3375
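/**
 * Compute the offset source for a pull constant load, either as an
 * immediate or, for relative addressing or Gen8+ send-from-GRF, as a
 * register computed before @inst.
 */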
3376 src_reg
3377 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3378 src_reg *reladdr, int reg_offset)
3379 {
3380 if (reladdr) {
3381 src_reg index = src_reg(this, glsl_type::int_type);
3382
3383 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3384 src_reg(reg_offset)));
3385
3386 /* Pre-gen6, the message header uses byte offsets instead of vec4
3387 * (16-byte) offset units.
3388 */
3389 if (devinfo->gen < 6) {
3390 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3391 }
3392
3393 return index;
3394 } else if (devinfo->gen >= 8) {
3395 /* Store the offset in a GRF so we can send-from-GRF. */
3396 src_reg offset = src_reg(this, glsl_type::int_type);
3397 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3398 return offset;
3399 } else {
3400 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3401 return src_reg(reg_offset * message_header_scale);
3402 }
3403 }
3404
3405 /**
3406 * Emits an instruction before @inst to load the value named by @orig_src
3407 * from scratch space at @base_offset to @temp.
3408 *
3409 * @base_offset is measured in 32-byte units (the size of a register).
3410 */
3411 void
3412 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3413 dst_reg temp, src_reg orig_src,
3414 int base_offset)
3415 {
3416 int reg_offset = base_offset + orig_src.reg_offset;
3417 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3418 reg_offset);
3419
3420 emit_before(block, inst, SCRATCH_READ(temp, index));
3421 }
3422
3423 /**
3424 * Emits an instruction after @inst to store the value to be written
3425 * to @orig_dst to scratch space at @base_offset, from @temp.
3426 *
3427 * @base_offset is measured in 32-byte units (the size of a register).
3428 */
3429 void
3430 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3431 int base_offset)
3432 {
3433 int reg_offset = base_offset + inst->dst.reg_offset;
3434 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3435 reg_offset);
3436
3437 /* Create a temporary register to store *inst's result in.
3438 *
3439 * We have to be careful in MOVing from our temporary result register in
3440 * the scratch write. If we swizzle from channels of the temporary that
3441 * weren't initialized, it will confuse live interval analysis, which will
3442 * make spilling fail to make progress.
3443 */
3444 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3445 inst->dst.type),
3446 brw_swizzle_for_mask(inst->dst.writemask));
3447 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3448 inst->dst.writemask));
3449 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3450 if (inst->opcode != BRW_OPCODE_SEL)
3451 write->predicate = inst->predicate;
3452 write->ir = inst->ir;
3453 write->annotation = inst->annotation;
3454 inst->insert_after(block, write);
3455
3456 inst->dst.file = temp.file;
3457 inst->dst.reg = temp.reg;
3458 inst->dst.reg_offset = temp.reg_offset;
3459 inst->dst.reladdr = NULL;
3460 }
3461
3462 /**
3463 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3464 * adds the scratch read(s) before \p inst. The function also checks for
3465 * recursive reladdr scratch accesses, issuing the corresponding scratch
3466 * loads and rewriting reladdr references accordingly.
3467 *
3468 * \return \p src if it did not require a scratch load, otherwise, the
3469 * register holding the result of the scratch load that the caller should
3470 * use to rewrite src.
3471 */
3472 src_reg
3473 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3474 vec4_instruction *inst, src_reg src)
3475 {
3476 /* Resolve recursive reladdr scratch access by calling ourselves
3477 * with src.reladdr
3478 */
3479 if (src.reladdr)
3480 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3481 *src.reladdr);
3482
3483 /* Now handle scratch access on src */
3484 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3485 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3486 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3487 src.reg = temp.reg;
3488 src.reg_offset = temp.reg_offset;
3489 src.reladdr = NULL;
3490 }
3491
3492 return src;
3493 }
3494
3495 /**
3496 * We can't generally support array access in GRF space, because a
3497 * single instruction's destination can only span 2 contiguous
3498 * registers. So, we send all GRF arrays that get variable index
3499 * access to scratch space.
3500 */
3501 void
3502 vec4_visitor::move_grf_array_access_to_scratch()
3503 {
3504 int scratch_loc[this->alloc.count];
3505 memset(scratch_loc, -1, sizeof(scratch_loc));
3506
3507 /* First, calculate the set of virtual GRFs that need to be punted
3508 * to scratch due to having any array access on them, and where in
3509 * scratch.
3510 */
3511 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3512 if (inst->dst.file == GRF && inst->dst.reladdr) {
3513 if (scratch_loc[inst->dst.reg] == -1) {
3514 scratch_loc[inst->dst.reg] = last_scratch;
3515 last_scratch += this->alloc.sizes[inst->dst.reg];
3516 }
3517
3518 for (src_reg *iter = inst->dst.reladdr;
3519 iter->reladdr;
3520 iter = iter->reladdr) {
3521 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3522 scratch_loc[iter->reg] = last_scratch;
3523 last_scratch += this->alloc.sizes[iter->reg];
3524 }
3525 }
3526 }
3527
3528 for (int i = 0 ; i < 3; i++) {
3529 for (src_reg *iter = &inst->src[i];
3530 iter->reladdr;
3531 iter = iter->reladdr) {
3532 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3533 scratch_loc[iter->reg] = last_scratch;
3534 last_scratch += this->alloc.sizes[iter->reg];
3535 }
3536 }
3537 }
3538 }
3539
3540 /* Now, for anything that will be accessed through scratch, rewrite
3541 * it to load/store. Note that this is a _safe list walk, because
3542 * we may generate a new scratch_write instruction after the one
3543 * we're processing.
3544 */
3545 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3546 /* Set up the annotation tracking for new generated instructions. */
3547 base_ir = inst->ir;
3548 current_annotation = inst->annotation;
3549
3550 /* First handle scratch access on the dst. Notice we have to handle
3551 * the case where the dst's reladdr also points to scratch space.
3552 */
3553 if (inst->dst.reladdr)
3554 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3555 *inst->dst.reladdr);
3556
3557 /* Now that we have handled any (possibly recursive) reladdr scratch
3558 * accesses for dst we can safely do the scratch write for dst itself
3559 */
3560 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3561 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3562
3563 /* Now handle scratch access on any src. In this case, since inst->src[i]
3564 * already is a src_reg, we can just call emit_resolve_reladdr with
3565 * inst->src[i] and it will take care of handling scratch loads for
3566 * both src and src.reladdr (recursively).
3567 */
3568 for (int i = 0 ; i < 3; i++) {
3569 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3570 inst->src[i]);
3571 }
3572 }
3573 }
3574
3575 /**
3576 * Emits an instruction before @inst to load the value named by @orig_src
3577 * from the pull constant buffer (surface) at @base_offset to @temp.
3578 */
3579 void
3580 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3581 dst_reg temp, src_reg orig_src,
3582 int base_offset)
3583 {
3584 int reg_offset = base_offset + orig_src.reg_offset;
3585 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3586 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3587 reg_offset);
3588
3589 emit_pull_constant_load_reg(temp,
3590 index,
3591 offset,
3592 block, inst);
3593 }
3594
3595 /**
3596 * Implements array access of uniforms by inserting a
3597 * PULL_CONSTANT_LOAD instruction.
3598 *
3599 * Unlike temporary GRF array access (where we don't support it due to
3600 * the difficulty of doing relative addressing on instruction
3601 * destinations), we could potentially do array access of uniforms
3602 * that were loaded in GRF space as push constants. In real-world
3603 * usage we've seen, though, the arrays being used are always larger
3604 * than we could load as push constants, so just always move all
3605 * uniform array access out to a pull constant buffer.
3606 */
3607 void
3608 vec4_visitor::move_uniform_array_access_to_pull_constants()
3609 {
3610 int pull_constant_loc[this->uniforms];
3611 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3612 bool nested_reladdr;
3613
3614 /* Walk through and find array access of uniforms. Put a copy of that
3615 * uniform in the pull constant buffer.
3616 *
3617 * Note that we don't move constant-indexed accesses to arrays. No
3618 * testing has been done of the performance impact of this choice.
3619 */
3620 do {
3621 nested_reladdr = false;
3622
3623 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3624 for (int i = 0 ; i < 3; i++) {
3625 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3626 continue;
3627
3628 int uniform = inst->src[i].reg;
3629
3630 if (inst->src[i].reladdr->reladdr)
3631 nested_reladdr = true; /* will need another pass */
3632
3633 /* If this array isn't already present in the pull constant buffer,
3634 * add it.
3635 */
3636 if (pull_constant_loc[uniform] == -1) {
3637 const gl_constant_value **values =
3638 &stage_prog_data->param[uniform * 4];
3639
3640 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3641
3642 assert(uniform < uniform_array_size);
3643 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3644 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3645 = values[j];
3646 }
3647 }
3648
3649 /* Set up the annotation tracking for new generated instructions. */
3650 base_ir = inst->ir;
3651 current_annotation = inst->annotation;
3652
3653 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3654
3655 emit_pull_constant_load(block, inst, temp, inst->src[i],
3656 pull_constant_loc[uniform]);
3657
3658 inst->src[i].file = temp.file;
3659 inst->src[i].reg = temp.reg;
3660 inst->src[i].reg_offset = temp.reg_offset;
3661 inst->src[i].reladdr = NULL;
3662 }
3663 }
3664 } while (nested_reladdr);
3665
3666 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3667 * no need to track them as larger-than-vec4 objects. This will be
3668 * relied on in cutting out unused uniform vectors from push
3669 * constants.
3670 */
3671 split_uniform_registers();
3672 }
3673
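/**
 * If a UD source has its negation flag set, resolve the negation into a
 * temporary with an explicit MOV so later instructions see a plain unsigned
 * value.
 */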
3674 void
3675 vec4_visitor::resolve_ud_negate(src_reg *reg)
3676 {
3677 if (reg->type != BRW_REGISTER_TYPE_UD ||
3678 !reg->negate)
3679 return;
3680
3681 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3682 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3683 *reg = temp;
3684 }
3685
3686 /**
3687 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3688 *
3689 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3690 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3691 */
3692 void
3693 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3694 {
3695 assert(devinfo->gen <= 5);
3696
3697 if (!rvalue->type->is_boolean())
3698 return;
3699
3700 src_reg and_result = src_reg(this, rvalue->type);
3701 src_reg neg_result = src_reg(this, rvalue->type);
3702 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3703 emit(MOV(dst_reg(neg_result), negate(and_result)));
3704 *reg = neg_result;
3705 }
3706
3707 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3708 void *log_data,
3709 struct gl_program *prog,
3710 const struct brw_vue_prog_key *key,
3711 struct brw_vue_prog_data *prog_data,
3712 struct gl_shader_program *shader_prog,
3713 gl_shader_stage stage,
3714 void *mem_ctx,
3715 bool no_spills,
3716 int shader_time_index)
3717 : backend_shader(compiler, log_data, mem_ctx,
3718 shader_prog, prog, &prog_data->base, stage),
3719 key(key),
3720 key_tex(&key->tex),
3721 prog_data(prog_data),
3722 sanity_param_count(0),
3723 fail_msg(NULL),
3724 first_non_payload_grf(0),
3725 need_all_constants_in_pull_buffer(false),
3726 no_spills(no_spills),
3727 shader_time_index(shader_time_index),
3728 last_scratch(0)
3729 {
3730 this->failed = false;
3731
3732 this->base_ir = NULL;
3733 this->current_annotation = NULL;
3734 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3735
3736 this->variable_ht = hash_table_ctor(0,
3737 hash_table_pointer_hash,
3738 hash_table_pointer_compare);
3739
3740 this->virtual_grf_start = NULL;
3741 this->virtual_grf_end = NULL;
3742 this->live_intervals = NULL;
3743
3744 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3745
3746 this->uniforms = 0;
3747
3748 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3749 * at least one. See setup_uniforms() in brw_vec4.cpp.
3750 */
3751 this->uniform_array_size = 1;
3752 if (prog_data) {
3753 this->uniform_array_size =
3754 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3755 }
3756
3757 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3758 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3759 }
3760
3761 vec4_visitor::~vec4_visitor()
3762 {
3763 hash_table_dtor(this->variable_ht);
3764 }
3765
3766
3767 void
3768 vec4_visitor::fail(const char *format, ...)
3769 {
3770 va_list va;
3771 char *msg;
3772
3773 if (failed)
3774 return;
3775
3776 failed = true;
3777
3778 va_start(va, format);
3779 msg = ralloc_vasprintf(mem_ctx, format, va);
3780 va_end(va);
3781 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3782
3783 this->fail_msg = msg;
3784
3785 if (debug_enabled) {
3786 fprintf(stderr, "%s", msg);
3787 }
3788 }
3789
3790 } /* namespace brw */