i965/nir/vec4: Prepare source and destination registers for ALU operations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result of the
220 * comparison, leaves the upper bits undefined, and updates the flag
221 * register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
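/* elements must be 2, 3 or 4: dot_opcodes[elements - 2] selects DP2, DP3 or DP4. */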
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
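/* Pre-Gen6, MATH is a send to the shared math unit, so the operands live in MRFs starting at base_mrf; mlen is 1 or 2 depending on whether a second source is present. */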
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
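/* The VF-encoded bytes 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below turns them into the UD shift counts <0, 8, 16, 24>. */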
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
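/* unpackSnorm4x8 is specified as clamp(b / 127.0, -1.0, +1.0), so clamp the scaled result to [-1, 1]. */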
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
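/* packUnorm4x8: round(clamp(f, 0.0, 1.0) * 255.0) per component, packed one byte per channel. */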
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
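/* packSnorm4x8: round(clamp(f, -1.0, +1.0) * 127.0) per component, packed one byte per channel. */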
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575 /**
576 * Returns the minimum number of vec4 elements needed to pack a type.
577 *
578 * For simple types, it will return 1 (a single vec4); for matrices, the
579 * number of columns; for arrays and structs, the sum of the vec4 sizes of
580 * their elements; and for samplers and atomics, zero.
581 *
582 * This method is useful to calculate how much register space is needed to
583 * store a particular type.
584 */
585 int
586 vec4_visitor::type_size(const struct glsl_type *type)
587 {
588 unsigned int i;
589 int size;
590
591 switch (type->base_type) {
592 case GLSL_TYPE_UINT:
593 case GLSL_TYPE_INT:
594 case GLSL_TYPE_FLOAT:
595 case GLSL_TYPE_BOOL:
596 if (type->is_matrix()) {
597 return type->matrix_columns;
598 } else {
599 /* Regardless of size of vector, it gets a vec4. This is bad
600 * packing for things like floats, but otherwise arrays become a
601 * mess. Hopefully a later pass over the code can pack scalars
602 * down if appropriate.
603 */
604 return 1;
605 }
606 case GLSL_TYPE_ARRAY:
607 assert(type->length > 0);
608 return type_size(type->fields.array) * type->length;
609 case GLSL_TYPE_STRUCT:
610 size = 0;
611 for (i = 0; i < type->length; i++) {
612 size += type_size(type->fields.structure[i].type);
613 }
614 return size;
615 case GLSL_TYPE_SUBROUTINE:
616 return 1;
617
618 case GLSL_TYPE_SAMPLER:
619 /* Samplers take up no register space, since they're baked in at
620 * link time.
621 */
622 return 0;
623 case GLSL_TYPE_ATOMIC_UINT:
624 return 0;
625 case GLSL_TYPE_IMAGE:
626 case GLSL_TYPE_VOID:
627 case GLSL_TYPE_DOUBLE:
628 case GLSL_TYPE_ERROR:
629 case GLSL_TYPE_INTERFACE:
630 unreachable("not reached");
631 }
632
633 return 0;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->alloc.allocate(v->type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = brw_swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(v->type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->alloc.allocate(v->type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 void
683 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
684 unsigned n)
685 {
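/* Each uniform slot holds a vec4's worth of param pointers; components beyond n point at a shared zero so the layout stays fully populated. */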
686 static const gl_constant_value zero = { 0 };
687
688 for (unsigned i = 0; i < n; ++i)
689 stage_prog_data->param[4 * uniforms + i] = &values[i];
690
691 for (unsigned i = n; i < 4; ++i)
692 stage_prog_data->param[4 * uniforms + i] = &zero;
693
694 uniform_vector_size[uniforms++] = n;
695 }
696
697 /* Our support for uniforms is piggy-backed on the struct
698 * gl_fragment_program, because that's where the values actually
699 * get stored, rather than in some global gl_shader_program uniform
700 * store.
701 */
702 void
703 vec4_visitor::setup_uniform_values(ir_variable *ir)
704 {
705 int namelen = strlen(ir->name);
706
707 /* The data for our (non-builtin) uniforms is stored in a series of
708 * gl_uniform_driver_storage structs for each subcomponent that
709 * glGetUniformLocation() could name. We know it's been set up in the same
710 * order we'd walk the type, so walk the list of storage and find anything
711 * with our name, or the prefix of a component that starts with our name.
712 */
713 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
714 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
715
716 if (storage->builtin)
717 continue;
718
719 if (strncmp(ir->name, storage->name, namelen) != 0 ||
720 (storage->name[namelen] != 0 &&
721 storage->name[namelen] != '.' &&
722 storage->name[namelen] != '[')) {
723 continue;
724 }
725
726 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
727 storage->type->matrix_columns);
728 const unsigned vector_size = storage->type->vector_elements;
729
730 for (unsigned s = 0; s < vector_count; s++)
731 setup_vector_uniform_values(&storage->storage[s * vector_size],
732 vector_size);
733 }
734 }
735
736 void
737 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
738 {
739 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
740 assert(this->uniforms < uniform_array_size);
741 this->uniform_vector_size[this->uniforms] = 4;
742 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
743 this->userplane[i].type = BRW_REGISTER_TYPE_F;
744 for (int j = 0; j < 4; ++j) {
745 stage_prog_data->param[this->uniforms * 4 + j] =
746 (gl_constant_value *) &clip_planes[i][j];
747 }
748 ++this->uniforms;
749 }
750 }
751
752 /* Our support for builtin uniforms is even scarier than non-builtin.
753 * It sits on top of the PROG_STATE_VAR parameters that are
754 * automatically updated from GL context state.
755 */
756 void
757 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
758 {
759 const ir_state_slot *const slots = ir->get_state_slots();
760 assert(slots != NULL);
761
762 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
763 /* This state reference has already been setup by ir_to_mesa,
764 * but we'll get the same index back here. We can reference
765 * ParameterValues directly, since unlike brw_fs.cpp, we never
766 * add new state references during compile.
767 */
768 int index = _mesa_add_state_reference(this->prog->Parameters,
769 (gl_state_index *)slots[i].tokens);
770 gl_constant_value *values =
771 &this->prog->Parameters->ParameterValues[index][0];
772
773 assert(this->uniforms < uniform_array_size);
774
775 for (unsigned j = 0; j < 4; j++)
776 stage_prog_data->param[this->uniforms * 4 + j] =
777 &values[GET_SWZ(slots[i].swizzle, j)];
778
779 this->uniform_vector_size[this->uniforms] =
780 (ir->type->is_scalar() || ir->type->is_vector() ||
781 ir->type->is_matrix() ? ir->type->vector_elements : 4);
782
783 this->uniforms++;
784 }
785 }
786
787 dst_reg *
788 vec4_visitor::variable_storage(ir_variable *var)
789 {
790 return (dst_reg *)hash_table_find(this->variable_ht, var);
791 }
792
793 void
794 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
795 enum brw_predicate *predicate)
796 {
797 ir_expression *expr = ir->as_expression();
798
799 *predicate = BRW_PREDICATE_NORMAL;
800
801 if (expr && expr->operation != ir_binop_ubo_load) {
802 src_reg op[3];
803 vec4_instruction *inst;
804
805 assert(expr->get_num_operands() <= 3);
806 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
807 expr->operands[i]->accept(this);
808 op[i] = this->result;
809
810 resolve_ud_negate(&op[i]);
811 }
812
813 switch (expr->operation) {
814 case ir_unop_logic_not:
815 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
816 inst->conditional_mod = BRW_CONDITIONAL_Z;
817 break;
818
819 case ir_binop_logic_xor:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(XOR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(XOR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_or:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(OR(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(OR(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_binop_logic_and:
842 if (devinfo->gen <= 5) {
843 src_reg temp = src_reg(this, ir->type);
844 emit(AND(dst_reg(temp), op[0], op[1]));
845 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
846 } else {
847 inst = emit(AND(dst_null_d(), op[0], op[1]));
848 }
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851
852 case ir_unop_f2b:
853 if (devinfo->gen >= 6) {
854 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
855 } else {
856 inst = emit(MOV(dst_null_f(), op[0]));
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 }
859 break;
860
861 case ir_unop_i2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_d(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_binop_all_equal:
871 if (devinfo->gen <= 5) {
872 resolve_bool_comparison(expr->operands[0], &op[0]);
873 resolve_bool_comparison(expr->operands[1], &op[1]);
874 }
875 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
876 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
877 break;
878
879 case ir_binop_any_nequal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
885 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
886 break;
887
888 case ir_unop_any:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_binop_greater:
897 case ir_binop_gequal:
898 case ir_binop_less:
899 case ir_binop_lequal:
900 case ir_binop_equal:
901 case ir_binop_nequal:
902 if (devinfo->gen <= 5) {
903 resolve_bool_comparison(expr->operands[0], &op[0]);
904 resolve_bool_comparison(expr->operands[1], &op[1]);
905 }
906 emit(CMP(dst_null_d(), op[0], op[1],
907 brw_conditional_for_comparison(expr->operation)));
908 break;
909
910 case ir_triop_csel: {
911 /* Expand the boolean condition into the flag register. */
912 inst = emit(MOV(dst_null_d(), op[0]));
913 inst->conditional_mod = BRW_CONDITIONAL_NZ;
914
915 /* Select which boolean to return. */
916 dst_reg temp(this, expr->operands[1]->type);
917 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
918 inst->predicate = BRW_PREDICATE_NORMAL;
919
920 /* Expand the result to a condition code. */
921 inst = emit(MOV(dst_null_d(), src_reg(temp)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 break;
924 }
925
926 default:
927 unreachable("not reached");
928 }
929 return;
930 }
931
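/* No special-cased expression: evaluate the condition normally, then AND it with 1 and set the flag on a non-zero result. */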
932 ir->accept(this);
933
934 resolve_ud_negate(&this->result);
935
936 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
937 inst->conditional_mod = BRW_CONDITIONAL_NZ;
938 }
939
940 /**
941 * Emit a gen6 IF statement with the comparison folded into the IF
942 * instruction.
943 */
944 void
945 vec4_visitor::emit_if_gen6(ir_if *ir)
946 {
947 ir_expression *expr = ir->condition->as_expression();
948
949 if (expr && expr->operation != ir_binop_ubo_load) {
950 src_reg op[3];
951 dst_reg temp;
952
953 assert(expr->get_num_operands() <= 3);
954 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
955 expr->operands[i]->accept(this);
956 op[i] = this->result;
957 }
958
959 switch (expr->operation) {
960 case ir_unop_logic_not:
961 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
962 return;
963
964 case ir_binop_logic_xor:
965 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_binop_logic_or:
969 temp = dst_reg(this, glsl_type::bool_type);
970 emit(OR(temp, op[0], op[1]));
971 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_and:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(AND(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_unop_f2b:
981 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_i2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_binop_greater:
989 case ir_binop_gequal:
990 case ir_binop_less:
991 case ir_binop_lequal:
992 case ir_binop_equal:
993 case ir_binop_nequal:
994 emit(IF(op[0], op[1],
995 brw_conditional_for_comparison(expr->operation)));
996 return;
997
998 case ir_binop_all_equal:
999 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1001 return;
1002
1003 case ir_binop_any_nequal:
1004 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1005 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1006 return;
1007
1008 case ir_unop_any:
1009 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1010 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1011 return;
1012
1013 case ir_triop_csel: {
1014 /* Expand the boolean condition into the flag register. */
1015 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1016 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1017
1018 /* Select which boolean to return. */
1019 dst_reg temp(this, expr->operands[1]->type);
1020 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1021 inst->predicate = BRW_PREDICATE_NORMAL;
1022
1023 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1024 return;
1025 }
1026
1027 default:
1028 unreachable("not reached");
1029 }
1030 return;
1031 }
1032
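/* No foldable expression: evaluate the condition and emit an IF that tests it against zero. */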
1033 ir->condition->accept(this);
1034
1035 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_variable *ir)
1040 {
1041 dst_reg *reg = NULL;
1042
1043 if (variable_storage(ir))
1044 return;
1045
1046 switch (ir->data.mode) {
1047 case ir_var_shader_in:
1048 assert(ir->data.location != -1);
1049 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1050 break;
1051
1052 case ir_var_shader_out:
1053 assert(ir->data.location != -1);
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055
1056 for (int i = 0; i < type_size(ir->type); i++) {
1057 output_reg[ir->data.location + i] = *reg;
1058 output_reg[ir->data.location + i].reg_offset = i;
1059 output_reg_annotation[ir->data.location + i] = ir->name;
1060 }
1061 break;
1062
1063 case ir_var_auto:
1064 case ir_var_temporary:
1065 reg = new(mem_ctx) dst_reg(this, ir->type);
1066 break;
1067
1068 case ir_var_uniform:
1069 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1070
1071 /* Thanks to the lower_ubo_reference pass, we will see only
1072 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1073 * variables, so no need for them to be in variable_ht.
1074 *
1075 * Some uniforms, such as samplers and atomic counters, have no actual
1076 * storage, so we should ignore them.
1077 */
1078 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1079 return;
1080
1081 /* Track how big the whole uniform variable is, in case we need to put a
1082 * copy of its data into pull constants for array access.
1083 */
1084 assert(this->uniforms < uniform_array_size);
1085 this->uniform_size[this->uniforms] = type_size(ir->type);
1086
1087 if (!strncmp(ir->name, "gl_", 3)) {
1088 setup_builtin_uniform_values(ir);
1089 } else {
1090 setup_uniform_values(ir);
1091 }
1092 break;
1093
1094 case ir_var_system_value:
1095 reg = make_reg_for_system_value(ir->data.location, ir->type);
1096 break;
1097
1098 default:
1099 unreachable("not reached");
1100 }
1101
1102 reg->type = brw_type_for_base_type(ir->type);
1103 hash_table_insert(this->variable_ht, reg, ir);
1104 }
1105
1106 void
1107 vec4_visitor::visit(ir_loop *ir)
1108 {
1109 /* We don't want debugging output to print the whole body of the
1110 * loop as the annotation.
1111 */
1112 this->base_ir = NULL;
1113
1114 emit(BRW_OPCODE_DO);
1115
1116 visit_instructions(&ir->body_instructions);
1117
1118 emit(BRW_OPCODE_WHILE);
1119 }
1120
1121 void
1122 vec4_visitor::visit(ir_loop_jump *ir)
1123 {
1124 switch (ir->mode) {
1125 case ir_loop_jump::jump_break:
1126 emit(BRW_OPCODE_BREAK);
1127 break;
1128 case ir_loop_jump::jump_continue:
1129 emit(BRW_OPCODE_CONTINUE);
1130 break;
1131 }
1132 }
1133
1134
1135 void
1136 vec4_visitor::visit(ir_function_signature *)
1137 {
1138 unreachable("not reached");
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_function *ir)
1143 {
1144 /* Ignore function bodies other than main() -- we shouldn't see calls to
1145 * them since they should all be inlined.
1146 */
1147 if (strcmp(ir->name, "main") == 0) {
1148 const ir_function_signature *sig;
1149 exec_list empty;
1150
1151 sig = ir->matching_signature(NULL, &empty, false);
1152
1153 assert(sig);
1154
1155 visit_instructions(&sig->body);
1156 }
1157 }
1158
1159 bool
1160 vec4_visitor::try_emit_mad(ir_expression *ir)
1161 {
1162 /* 3-src instructions were introduced in gen6. */
1163 if (devinfo->gen < 6)
1164 return false;
1165
1166 /* MAD can only handle floating-point data. */
1167 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1168 return false;
1169
1170 ir_rvalue *nonmul;
1171 ir_expression *mul;
1172 bool mul_negate, mul_abs;
1173
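/* Check both operands of the add for a multiply, looking through a single negate or abs wrapping it. */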
1174 for (int i = 0; i < 2; i++) {
1175 mul_negate = false;
1176 mul_abs = false;
1177
1178 mul = ir->operands[i]->as_expression();
1179 nonmul = ir->operands[1 - i];
1180
1181 if (mul && mul->operation == ir_unop_abs) {
1182 mul = mul->operands[0]->as_expression();
1183 mul_abs = true;
1184 } else if (mul && mul->operation == ir_unop_neg) {
1185 mul = mul->operands[0]->as_expression();
1186 mul_negate = true;
1187 }
1188
1189 if (mul && mul->operation == ir_binop_mul)
1190 break;
1191 }
1192
1193 if (!mul || mul->operation != ir_binop_mul)
1194 return false;
1195
1196 nonmul->accept(this);
1197 src_reg src0 = fix_3src_operand(this->result);
1198
1199 mul->operands[0]->accept(this);
1200 src_reg src1 = fix_3src_operand(this->result);
1201 src1.negate ^= mul_negate;
1202 src1.abs = mul_abs;
1203 if (mul_abs)
1204 src1.negate = false;
1205
1206 mul->operands[1]->accept(this);
1207 src_reg src2 = fix_3src_operand(this->result);
1208 src2.abs = mul_abs;
1209 if (mul_abs)
1210 src2.negate = false;
1211
1212 this->result = src_reg(this, ir->type);
1213 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1214
1215 return true;
1216 }
1217
1218 bool
1219 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1220 {
1221 /* This optimization relies on CMP setting the destination to 0 when
1222 * false. Early hardware only sets the least significant bit, and
1223 * leaves the other bits undefined. So we can't use it.
1224 */
1225 if (devinfo->gen < 6)
1226 return false;
1227
1228 ir_expression *const cmp = ir->operands[0]->as_expression();
1229
1230 if (cmp == NULL)
1231 return false;
1232
1233 switch (cmp->operation) {
1234 case ir_binop_less:
1235 case ir_binop_greater:
1236 case ir_binop_lequal:
1237 case ir_binop_gequal:
1238 case ir_binop_equal:
1239 case ir_binop_nequal:
1240 break;
1241
1242 default:
1243 return false;
1244 }
1245
1246 cmp->operands[0]->accept(this);
1247 const src_reg cmp_src0 = this->result;
1248
1249 cmp->operands[1]->accept(this);
1250 const src_reg cmp_src1 = this->result;
1251
1252 this->result = src_reg(this, ir->type);
1253
1254 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1255 brw_conditional_for_comparison(cmp->operation)));
1256
1257 /* If the comparison is false, this->result will just happen to be zero.
1258 */
1259 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1260 this->result, src_reg(1.0f));
1261 inst->predicate = BRW_PREDICATE_NORMAL;
1262 inst->predicate_inverse = true;
1263
1264 return true;
1265 }
1266
1267 void
1268 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1269 src_reg src0, src_reg src1)
1270 {
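/* On Gen6+ SEL can apply the conditional mod directly; earlier generations need an explicit CMP followed by a predicated SEL. */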
1271 vec4_instruction *inst;
1272
1273 if (devinfo->gen >= 6) {
1274 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1275 inst->conditional_mod = conditionalmod;
1276 } else {
1277 emit(CMP(dst, src0, src1, conditionalmod));
1278
1279 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1280 inst->predicate = BRW_PREDICATE_NORMAL;
1281 }
1282 }
1283
1284 void
1285 vec4_visitor::emit_lrp(const dst_reg &dst,
1286 const src_reg &x, const src_reg &y, const src_reg &a)
1287 {
1288 if (devinfo->gen >= 6) {
1289 /* Note that the instruction's argument order is reversed from GLSL
1290 * and the IR.
1291 */
1292 emit(LRP(dst,
1293 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1294 } else {
1295 /* Earlier generations don't support three source operations, so we
1296 * need to emit x*(1-a) + y*a.
1297 */
1298 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1299 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1300 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1301 y_times_a.writemask = dst.writemask;
1302 one_minus_a.writemask = dst.writemask;
1303 x_times_one_minus_a.writemask = dst.writemask;
1304
1305 emit(MUL(y_times_a, y, a));
1306 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1307 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1308 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1309 }
1310 }
1311
1312 /**
1313 * Emits the instructions needed to perform a pull constant load. before_block
1314 * and before_inst can be NULL, in which case the instructions will be appended
1315 * to the end of the instruction list.
1316 */
1317 void
1318 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1319 src_reg surf_index,
1320 src_reg offset_reg,
1321 bblock_t *before_block,
1322 vec4_instruction *before_inst)
1323 {
1324 assert((before_inst == NULL && before_block == NULL) ||
1325 (before_inst && before_block));
1326
1327 vec4_instruction *pull;
1328
1329 if (devinfo->gen >= 9) {
1330 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1331 src_reg header(this, glsl_type::uvec4_type, 2);
1332
1333 pull = new(mem_ctx)
1334 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1335 dst_reg(header));
1336
1337 if (before_inst)
1338 emit_before(before_block, before_inst, pull);
1339 else
1340 emit(pull);
1341
1342 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1343 offset_reg.type);
1344 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1345
1346 if (before_inst)
1347 emit_before(before_block, before_inst, pull);
1348 else
1349 emit(pull);
1350
1351 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1352 dst,
1353 surf_index,
1354 header);
1355 pull->mlen = 2;
1356 pull->header_size = 1;
1357 } else if (devinfo->gen >= 7) {
1358 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1359
1360 grf_offset.type = offset_reg.type;
1361
1362 pull = MOV(grf_offset, offset_reg);
1363
1364 if (before_inst)
1365 emit_before(before_block, before_inst, pull);
1366 else
1367 emit(pull);
1368
1369 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1370 dst,
1371 surf_index,
1372 src_reg(grf_offset));
1373 pull->mlen = 1;
1374 } else {
1375 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1376 dst,
1377 surf_index,
1378 offset_reg);
1379 pull->base_mrf = 14;
1380 pull->mlen = 1;
1381 }
1382
1383 if (before_inst)
1384 emit_before(before_block, before_inst, pull);
1385 else
1386 emit(pull);
1387 }
1388
1389 src_reg
1390 vec4_visitor::emit_uniformize(const src_reg &src)
1391 {
1392 const src_reg chan_index(this, glsl_type::uint_type);
1393 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1394 src.type);
1395
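/* FIND_LIVE_CHANNEL writes the index of an enabled channel; BROADCAST then replicates that channel of src across the destination, yielding a value that is uniform across channels. */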
1396 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1397 ->force_writemask_all = true;
1398 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1399 ->force_writemask_all = true;
1400
1401 return src_reg(dst);
1402 }
1403
1404 void
1405 vec4_visitor::visit(ir_expression *ir)
1406 {
1407 unsigned int operand;
1408 src_reg op[ARRAY_SIZE(ir->operands)];
1409 vec4_instruction *inst;
1410
1411 if (ir->operation == ir_binop_add) {
1412 if (try_emit_mad(ir))
1413 return;
1414 }
1415
1416 if (ir->operation == ir_unop_b2f) {
1417 if (try_emit_b2f_of_compare(ir))
1418 return;
1419 }
1420
1421 /* Storage for our result. Ideally for an assignment we'd be using
1422 * the actual storage for the result here, instead.
1423 */
1424 dst_reg result_dst(this, ir->type);
1425 src_reg result_src(result_dst);
1426
1427 if (ir->operation == ir_triop_csel) {
1428 ir->operands[1]->accept(this);
1429 op[1] = this->result;
1430 ir->operands[2]->accept(this);
1431 op[2] = this->result;
1432
1433 enum brw_predicate predicate;
1434 emit_bool_to_cond_code(ir->operands[0], &predicate);
1435 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1436 inst->predicate = predicate;
1437 this->result = result_src;
1438 return;
1439 }
1440
1441 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1442 this->result.file = BAD_FILE;
1443 ir->operands[operand]->accept(this);
1444 if (this->result.file == BAD_FILE) {
1445 fprintf(stderr, "Failed to get tree for expression operand:\n");
1446 ir->operands[operand]->fprint(stderr);
1447 exit(1);
1448 }
1449 op[operand] = this->result;
1450
1451 /* Matrix expression operands should have been broken down to vector
1452 * operations already.
1453 */
1454 assert(!ir->operands[operand]->type->is_matrix());
1455 }
1456
1457 /* If nothing special happens, this is the result. */
1458 this->result = result_src;
1459
1460 switch (ir->operation) {
1461 case ir_unop_logic_not:
1462 emit(NOT(result_dst, op[0]));
1463 break;
1464 case ir_unop_neg:
1465 op[0].negate = !op[0].negate;
1466 emit(MOV(result_dst, op[0]));
1467 break;
1468 case ir_unop_abs:
1469 op[0].abs = true;
1470 op[0].negate = false;
1471 emit(MOV(result_dst, op[0]));
1472 break;
1473
1474 case ir_unop_sign:
1475 if (ir->type->is_float()) {
1476 /* AND(val, 0x80000000) gives the sign bit.
1477 *
1478 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1479 * zero.
1480 */
1481 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1482
1483 op[0].type = BRW_REGISTER_TYPE_UD;
1484 result_dst.type = BRW_REGISTER_TYPE_UD;
1485 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1486
1487 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1488 inst->predicate = BRW_PREDICATE_NORMAL;
1489
1490 this->result.type = BRW_REGISTER_TYPE_F;
1491 } else {
1492 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1493 * -> non-negative val generates 0x00000000.
1494 * Predicated OR sets 1 if val is positive.
1495 */
1496 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1497
1498 emit(ASR(result_dst, op[0], src_reg(31)));
1499
1500 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1501 inst->predicate = BRW_PREDICATE_NORMAL;
1502 }
1503 break;
1504
1505 case ir_unop_rcp:
1506 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_exp2:
1510 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1511 break;
1512 case ir_unop_log2:
1513 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1514 break;
1515 case ir_unop_exp:
1516 case ir_unop_log:
1517 unreachable("not reached: should be handled by ir_explog_to_explog2");
1518 case ir_unop_sin:
1519 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1520 break;
1521 case ir_unop_cos:
1522 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1523 break;
1524
1525 case ir_unop_dFdx:
1526 case ir_unop_dFdx_coarse:
1527 case ir_unop_dFdx_fine:
1528 case ir_unop_dFdy:
1529 case ir_unop_dFdy_coarse:
1530 case ir_unop_dFdy_fine:
1531 unreachable("derivatives not valid in vertex shader");
1532
1533 case ir_unop_bitfield_reverse:
1534 emit(BFREV(result_dst, op[0]));
1535 break;
1536 case ir_unop_bit_count:
1537 emit(CBIT(result_dst, op[0]));
1538 break;
1539 case ir_unop_find_msb: {
1540 src_reg temp = src_reg(this, glsl_type::uint_type);
1541
1542 inst = emit(FBH(dst_reg(temp), op[0]));
1543 inst->dst.writemask = WRITEMASK_XYZW;
1544
1545 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1546 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1547 * subtract the result from 31 to convert the MSB count into an LSB count.
1548 */
1549
1550 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1551 temp.swizzle = BRW_SWIZZLE_NOOP;
1552 emit(MOV(result_dst, temp));
1553
1554 src_reg src_tmp = src_reg(result_dst);
1555 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1556
1557 src_tmp.negate = true;
1558 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1559 inst->predicate = BRW_PREDICATE_NORMAL;
1560 break;
1561 }
1562 case ir_unop_find_lsb:
1563 emit(FBL(result_dst, op[0]));
1564 break;
1565 case ir_unop_saturate:
1566 inst = emit(MOV(result_dst, op[0]));
1567 inst->saturate = true;
1568 break;
1569
1570 case ir_unop_noise:
1571 unreachable("not reached: should be handled by lower_noise");
1572
1573 case ir_unop_subroutine_to_int:
1574 emit(MOV(result_dst, op[0]));
1575 break;
1576
1577 case ir_binop_add:
1578 emit(ADD(result_dst, op[0], op[1]));
1579 break;
1580 case ir_binop_sub:
1581 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1582
1583 case ir_binop_mul:
1584 if (devinfo->gen < 8 && ir->type->is_integer()) {
1585 /* For integer multiplication, the MUL uses the low 16 bits of one of
1586 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1587 * accumulates the contribution of the upper 16 bits of that
1588 * operand. If we can determine that one of the args is in the low
1589 * 16 bits, though, we can just emit a single MUL.
1590 */
1591 if (ir->operands[0]->is_uint16_constant()) {
1592 if (devinfo->gen < 7)
1593 emit(MUL(result_dst, op[0], op[1]));
1594 else
1595 emit(MUL(result_dst, op[1], op[0]));
1596 } else if (ir->operands[1]->is_uint16_constant()) {
1597 if (devinfo->gen < 7)
1598 emit(MUL(result_dst, op[1], op[0]));
1599 else
1600 emit(MUL(result_dst, op[0], op[1]));
1601 } else {
1602 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1603
1604 emit(MUL(acc, op[0], op[1]));
1605 emit(MACH(dst_null_d(), op[0], op[1]));
1606 emit(MOV(result_dst, src_reg(acc)));
1607 }
1608 } else {
1609 emit(MUL(result_dst, op[0], op[1]));
1610 }
1611 break;
1612 case ir_binop_imul_high: {
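/* MUL leaves the low 32 bits of the product in the accumulator; MACH then produces the high 32 bits. */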
1613 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1614
1615 emit(MUL(acc, op[0], op[1]));
1616 emit(MACH(result_dst, op[0], op[1]));
1617 break;
1618 }
1619 case ir_binop_div:
1620 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1621 assert(ir->type->is_integer());
1622 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1623 break;
1624
1625 case ir_binop_carry:
1626 unreachable("Should have been lowered by carry_to_arith().");
1627
1628 case ir_binop_borrow:
1629 unreachable("Should have been lowered by borrow_to_arith().");
1630
1631 case ir_binop_mod:
1632 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1633 assert(ir->type->is_integer());
1634 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1635 break;
1636
1637 case ir_binop_less:
1638 case ir_binop_greater:
1639 case ir_binop_lequal:
1640 case ir_binop_gequal:
1641 case ir_binop_equal:
1642 case ir_binop_nequal: {
1643 if (devinfo->gen <= 5) {
1644 resolve_bool_comparison(ir->operands[0], &op[0]);
1645 resolve_bool_comparison(ir->operands[1], &op[1]);
1646 }
1647 emit(CMP(result_dst, op[0], op[1],
1648 brw_conditional_for_comparison(ir->operation)));
1649 break;
1650 }
1651
1652 case ir_binop_all_equal:
1653 if (devinfo->gen <= 5) {
1654 resolve_bool_comparison(ir->operands[0], &op[0]);
1655 resolve_bool_comparison(ir->operands[1], &op[1]);
1656 }
1657
1658 /* "==" operator producing a scalar boolean. */
1659 if (ir->operands[0]->type->is_vector() ||
1660 ir->operands[1]->type->is_vector()) {
1661 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1662 emit(MOV(result_dst, src_reg(0)));
1663 inst = emit(MOV(result_dst, src_reg(~0)));
1664 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1665 } else {
1666 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1667 }
1668 break;
1669 case ir_binop_any_nequal:
1670 if (devinfo->gen <= 5) {
1671 resolve_bool_comparison(ir->operands[0], &op[0]);
1672 resolve_bool_comparison(ir->operands[1], &op[1]);
1673 }
1674
1675 /* "!=" operator producing a scalar boolean. */
1676 if (ir->operands[0]->type->is_vector() ||
1677 ir->operands[1]->type->is_vector()) {
1678 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1679
1680 emit(MOV(result_dst, src_reg(0)));
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1683 } else {
1684 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1685 }
1686 break;
1687
1688 case ir_unop_any:
1689 if (devinfo->gen <= 5) {
1690 resolve_bool_comparison(ir->operands[0], &op[0]);
1691 }
1692 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1693 emit(MOV(result_dst, src_reg(0)));
1694
1695 inst = emit(MOV(result_dst, src_reg(~0)));
1696 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1697 break;
1698
1699 case ir_binop_logic_xor:
1700 emit(XOR(result_dst, op[0], op[1]));
1701 break;
1702
1703 case ir_binop_logic_or:
1704 emit(OR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_and:
1708 emit(AND(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_dot:
1712 assert(ir->operands[0]->type->is_vector());
1713 assert(ir->operands[0]->type == ir->operands[1]->type);
1714 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1715 break;
1716
1717 case ir_unop_sqrt:
1718 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1719 break;
1720 case ir_unop_rsq:
1721 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1722 break;
1723
1724 case ir_unop_bitcast_i2f:
1725 case ir_unop_bitcast_u2f:
1726 this->result = op[0];
1727 this->result.type = BRW_REGISTER_TYPE_F;
1728 break;
1729
1730 case ir_unop_bitcast_f2i:
1731 this->result = op[0];
1732 this->result.type = BRW_REGISTER_TYPE_D;
1733 break;
1734
1735 case ir_unop_bitcast_f2u:
1736 this->result = op[0];
1737 this->result.type = BRW_REGISTER_TYPE_UD;
1738 break;
1739
1740 case ir_unop_i2f:
1741 case ir_unop_i2u:
1742 case ir_unop_u2i:
1743 case ir_unop_u2f:
1744 case ir_unop_f2i:
1745 case ir_unop_f2u:
1746 emit(MOV(result_dst, op[0]));
1747 break;
1748 case ir_unop_b2i:
1749 case ir_unop_b2f:
1750 if (devinfo->gen <= 5) {
1751 resolve_bool_comparison(ir->operands[0], &op[0]);
1752 }
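/* Booleans are stored as 0 or ~0 (0 or -1), so negating gives 0 or 1, which the MOV then converts to the destination type. */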
1753 emit(MOV(result_dst, negate(op[0])));
1754 break;
1755 case ir_unop_f2b:
1756 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1757 break;
1758 case ir_unop_i2b:
1759 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1760 break;
1761
1762 case ir_unop_trunc:
1763 emit(RNDZ(result_dst, op[0]));
1764 break;
1765 case ir_unop_ceil: {
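/* ceil(x) = -floor(-x): negate, round toward negative infinity, negate again. */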
1766 src_reg tmp = src_reg(this, ir->type);
1767 op[0].negate = !op[0].negate;
1768 emit(RNDD(dst_reg(tmp), op[0]));
1769 tmp.negate = true;
1770 emit(MOV(result_dst, tmp));
1771 }
1772 break;
1773 case ir_unop_floor:
1774 inst = emit(RNDD(result_dst, op[0]));
1775 break;
1776 case ir_unop_fract:
1777 inst = emit(FRC(result_dst, op[0]));
1778 break;
1779 case ir_unop_round_even:
1780 emit(RNDE(result_dst, op[0]));
1781 break;
1782
1783 case ir_binop_min:
1784 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1785 break;
1786 case ir_binop_max:
1787 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1788 break;
1789
1790 case ir_binop_pow:
1791 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_unop_bit_not:
1795 inst = emit(NOT(result_dst, op[0]));
1796 break;
1797 case ir_binop_bit_and:
1798 inst = emit(AND(result_dst, op[0], op[1]));
1799 break;
1800 case ir_binop_bit_xor:
1801 inst = emit(XOR(result_dst, op[0], op[1]));
1802 break;
1803 case ir_binop_bit_or:
1804 inst = emit(OR(result_dst, op[0], op[1]));
1805 break;
1806
1807 case ir_binop_lshift:
1808 inst = emit(SHL(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_rshift:
1812 if (ir->type->base_type == GLSL_TYPE_INT)
1813 inst = emit(ASR(result_dst, op[0], op[1]));
1814 else
1815 inst = emit(SHR(result_dst, op[0], op[1]));
1816 break;
1817
1818 case ir_binop_bfm:
1819 emit(BFI1(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_ubo_load: {
1823 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1824 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1825 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1826 src_reg offset;
1827
1828 /* Now, load the vector from that offset. */
1829 assert(ir->type->is_vector() || ir->type->is_scalar());
1830
1831 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1832 packed_consts.type = result.type;
1833 src_reg surf_index;
1834
1835 if (const_uniform_block) {
1836 /* The block index is a constant, so just emit the binding table entry
1837 * as an immediate.
1838 */
1839 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1840 const_uniform_block->value.u[0]);
1841 } else {
1842 /* The block index is not a constant. Evaluate the index expression
1843 * per-channel and add the base UBO index; we have to select a value
1844 * from any live channel.
1845 */
1846 surf_index = src_reg(this, glsl_type::uint_type);
1847 emit(ADD(dst_reg(surf_index), op[0],
1848 src_reg(prog_data->base.binding_table.ubo_start)));
1849 surf_index = emit_uniformize(surf_index);
1850
1851 /* Assume this may touch any UBO. It would be nice to provide
1852 * a tighter bound, but the array information is already lowered away.
1853 */
1854 brw_mark_surface_used(&prog_data->base,
1855 prog_data->base.binding_table.ubo_start +
1856 shader_prog->NumUniformBlocks - 1);
1857 }
1858
1859 if (const_offset_ir) {
1860 if (devinfo->gen >= 8) {
1861 /* Store the offset in a GRF so we can send-from-GRF. */
1862 offset = src_reg(this, glsl_type::int_type);
1863 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1864 } else {
1865 /* Immediates are fine on older generations since they'll be moved
1866 * to a (potentially fake) MRF at the generator level.
1867 */
1868 offset = src_reg(const_offset / 16);
1869 }
1870 } else {
1871 offset = src_reg(this, glsl_type::uint_type);
1872 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1873 }
1874
1875 emit_pull_constant_load_reg(dst_reg(packed_consts),
1876 surf_index,
1877 offset,
1878 NULL, NULL /* before_block/inst */);
1879
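/* The pull load fetches a 16-byte-aligned vec4; bias every swizzle channel by (const_offset % 16) / 4 so the read starts at the requested dword within it. */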
1880 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1881 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1882 const_offset % 16 / 4,
1883 const_offset % 16 / 4,
1884 const_offset % 16 / 4);
1885
1886 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1887 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1888 emit(CMP(result_dst, packed_consts, src_reg(0u),
1889 BRW_CONDITIONAL_NZ));
1890 } else {
1891 emit(MOV(result_dst, packed_consts));
1892 }
1893 break;
1894 }
1895
1896 case ir_binop_vector_extract:
1897 unreachable("should have been lowered by vec_index_to_cond_assign");
1898
1899 case ir_triop_fma:
1900 op[0] = fix_3src_operand(op[0]);
1901 op[1] = fix_3src_operand(op[1]);
1902 op[2] = fix_3src_operand(op[2]);
1903 /* Note that the instruction's argument order is reversed from GLSL
1904 * and the IR.
1905 */
1906 emit(MAD(result_dst, op[2], op[1], op[0]));
1907 break;
1908
1909 case ir_triop_lrp:
1910 emit_lrp(result_dst, op[0], op[1], op[2]);
1911 break;
1912
1913 case ir_triop_csel:
1914 unreachable("already handled above");
1915 break;
1916
1917 case ir_triop_bfi:
1918 op[0] = fix_3src_operand(op[0]);
1919 op[1] = fix_3src_operand(op[1]);
1920 op[2] = fix_3src_operand(op[2]);
1921 emit(BFI2(result_dst, op[0], op[1], op[2]));
1922 break;
1923
1924 case ir_triop_bitfield_extract:
1925 op[0] = fix_3src_operand(op[0]);
1926 op[1] = fix_3src_operand(op[1]);
1927 op[2] = fix_3src_operand(op[2]);
1928 /* Note that the instruction's argument order is reversed from GLSL
1929 * and the IR.
1930 */
1931 emit(BFE(result_dst, op[2], op[1], op[0]));
1932 break;
1933
1934 case ir_triop_vector_insert:
1935 unreachable("should have been lowered by lower_vector_insert");
1936
1937 case ir_quadop_bitfield_insert:
1938 unreachable("not reached: should be handled by "
1939 "bitfield_insert_to_bfm_bfi\n");
1940
1941 case ir_quadop_vector:
1942 unreachable("not reached: should be handled by lower_quadop_vector");
1943
1944 case ir_unop_pack_half_2x16:
1945 emit_pack_half_2x16(result_dst, op[0]);
1946 break;
1947 case ir_unop_unpack_half_2x16:
1948 emit_unpack_half_2x16(result_dst, op[0]);
1949 break;
1950 case ir_unop_unpack_unorm_4x8:
1951 emit_unpack_unorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_unpack_snorm_4x8:
1954 emit_unpack_snorm_4x8(result_dst, op[0]);
1955 break;
1956 case ir_unop_pack_unorm_4x8:
1957 emit_pack_unorm_4x8(result_dst, op[0]);
1958 break;
1959 case ir_unop_pack_snorm_4x8:
1960 emit_pack_snorm_4x8(result_dst, op[0]);
1961 break;
1962 case ir_unop_pack_snorm_2x16:
1963 case ir_unop_pack_unorm_2x16:
1964 case ir_unop_unpack_snorm_2x16:
1965 case ir_unop_unpack_unorm_2x16:
1966 unreachable("not reached: should be handled by lower_packing_builtins");
1967 case ir_unop_unpack_half_2x16_split_x:
1968 case ir_unop_unpack_half_2x16_split_y:
1969 case ir_binop_pack_half_2x16_split:
1970 case ir_unop_interpolate_at_centroid:
1971 case ir_binop_interpolate_at_sample:
1972 case ir_binop_interpolate_at_offset:
1973 unreachable("not reached: should not occur in vertex shader");
1974 case ir_binop_ldexp:
1975 unreachable("not reached: should be handled by ldexp_to_arith()");
1976 case ir_unop_d2f:
1977 case ir_unop_f2d:
1978 case ir_unop_d2i:
1979 case ir_unop_i2d:
1980 case ir_unop_d2u:
1981 case ir_unop_u2d:
1982 case ir_unop_d2b:
1983 case ir_unop_pack_double_2x32:
1984 case ir_unop_unpack_double_2x32:
1985 case ir_unop_frexp_sig:
1986 case ir_unop_frexp_exp:
1987 unreachable("fp64 todo");
1988 }
1989 }
1990
1991
1992 void
1993 vec4_visitor::visit(ir_swizzle *ir)
1994 {
1995    /* Note that this handles only swizzles in expressions, not those on the
1996     * left-hand side of an assignment, which use write masking.  See
1997     * ir_assignment for that.
1998 */
1999 const unsigned swz = brw_compose_swizzle(
2000 brw_swizzle_for_size(ir->type->vector_elements),
2001 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2002
2003 ir->val->accept(this);
2004 this->result = swizzle(this->result, swz);
2005 }
2006
2007 void
2008 vec4_visitor::visit(ir_dereference_variable *ir)
2009 {
2010 const struct glsl_type *type = ir->type;
2011 dst_reg *reg = variable_storage(ir->var);
2012
2013 if (!reg) {
2014 fail("Failed to find variable storage for %s\n", ir->var->name);
2015 this->result = src_reg(brw_null_reg());
2016 return;
2017 }
2018
2019 this->result = src_reg(*reg);
2020
2021 /* System values get their swizzle from the dst_reg writemask */
2022 if (ir->var->data.mode == ir_var_system_value)
2023 return;
2024
2025 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2026 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2027 }
2028
2029
2030 int
2031 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2032 {
2033 /* Under normal circumstances array elements are stored consecutively, so
2034 * the stride is equal to the size of the array element.
2035 */
2036 return type_size(ir->type);
2037 }
2038
2039
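/**
 * Handle array dereferences.  A constant index is folded directly into the
 * register offset; a variable index is multiplied by the array stride and
 * chained onto the register's reladdr, to be resolved later by the
 * scratch/pull-constant lowering passes below.
 */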
2040 void
2041 vec4_visitor::visit(ir_dereference_array *ir)
2042 {
2043 ir_constant *constant_index;
2044 src_reg src;
2045 int array_stride = compute_array_stride(ir);
2046
2047 constant_index = ir->array_index->constant_expression_value();
2048
2049 ir->array->accept(this);
2050 src = this->result;
2051
2052 if (constant_index) {
2053 src.reg_offset += constant_index->value.i[0] * array_stride;
2054 } else {
2055 /* Variable index array dereference. It eats the "vec4" of the
2056 * base of the array and an index that offsets the Mesa register
2057 * index.
2058 */
2059 ir->array_index->accept(this);
2060
2061 src_reg index_reg;
2062
2063 if (array_stride == 1) {
2064 index_reg = this->result;
2065 } else {
2066 index_reg = src_reg(this, glsl_type::int_type);
2067
2068 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2069 }
2070
2071 if (src.reladdr) {
2072 src_reg temp = src_reg(this, glsl_type::int_type);
2073
2074 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2075
2076 index_reg = temp;
2077 }
2078
2079 src.reladdr = ralloc(mem_ctx, src_reg);
2080 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2081 }
2082
2083 /* If the type is smaller than a vec4, replicate the last channel out. */
2084 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2085 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2086 else
2087 src.swizzle = BRW_SWIZZLE_NOOP;
2088 src.type = brw_type_for_base_type(ir->type);
2089
2090 this->result = src;
2091 }
2092
2093 void
2094 vec4_visitor::visit(ir_dereference_record *ir)
2095 {
2096 unsigned int i;
2097 const glsl_type *struct_type = ir->record->type;
2098 int offset = 0;
2099
2100 ir->record->accept(this);
2101
2102 for (i = 0; i < struct_type->length; i++) {
2103 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2104 break;
2105 offset += type_size(struct_type->fields.structure[i].type);
2106 }
2107
2108 /* If the type is smaller than a vec4, replicate the last channel out. */
2109 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2110 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2111 else
2112 this->result.swizzle = BRW_SWIZZLE_NOOP;
2113 this->result.type = brw_type_for_base_type(ir->type);
2114
2115 this->result.reg_offset += offset;
2116 }
2117
2118 /**
2119 * We want to be careful in assignment setup to hit the actual storage
2120 * instead of potentially using a temporary like we might with the
2121 * ir_dereference handler.
2122 */
2123 static dst_reg
2124 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2125 {
2126    /* The LHS must be a dereference.  If the LHS is a variable-indexed array
2127     * access of a vector, it must have been separated into a series of
2128     * conditional moves before reaching this point (see ir_vec_index_to_cond_assign).
2129 */
2130 assert(ir->as_dereference());
2131 ir_dereference_array *deref_array = ir->as_dereference_array();
2132 if (deref_array) {
2133 assert(!deref_array->array->type->is_vector());
2134 }
2135
2136 /* Use the rvalue deref handler for the most part. We'll ignore
2137 * swizzles in it and write swizzles using writemask, though.
2138 */
2139 ir->accept(v);
2140 return dst_reg(v->result);
2141 }
2142
2143 void
2144 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2145 const struct glsl_type *type,
2146 enum brw_predicate predicate)
2147 {
2148 if (type->base_type == GLSL_TYPE_STRUCT) {
2149 for (unsigned int i = 0; i < type->length; i++) {
2150 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2151 }
2152 return;
2153 }
2154
2155 if (type->is_array()) {
2156 for (unsigned int i = 0; i < type->length; i++) {
2157 emit_block_move(dst, src, type->fields.array, predicate);
2158 }
2159 return;
2160 }
2161
2162 if (type->is_matrix()) {
2163 const struct glsl_type *vec_type;
2164
2165 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2166 type->vector_elements, 1);
2167
2168 for (int i = 0; i < type->matrix_columns; i++) {
2169 emit_block_move(dst, src, vec_type, predicate);
2170 }
2171 return;
2172 }
2173
2174 assert(type->is_scalar() || type->is_vector());
2175
2176 dst->type = brw_type_for_base_type(type);
2177 src->type = dst->type;
2178
2179 dst->writemask = (1 << type->vector_elements) - 1;
2180
2181 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2182
2183 vec4_instruction *inst = emit(MOV(*dst, *src));
2184 inst->predicate = predicate;
2185
2186 dst->reg_offset++;
2187 src->reg_offset++;
2188 }
2189
2190
2191 /* If the RHS processing resulted in an instruction generating a
2192 * temporary value, and it would be easy to rewrite the instruction to
2193 * generate its result right into the LHS instead, do so. This ends
2194 * up reliably removing instructions where it can be tricky to do so
2195 * later without real UD chain information.
2196 */
2197 bool
2198 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2199 dst_reg dst,
2200 src_reg src,
2201 vec4_instruction *pre_rhs_inst,
2202 vec4_instruction *last_rhs_inst)
2203 {
2204 /* This could be supported, but it would take more smarts. */
2205 if (ir->condition)
2206 return false;
2207
2208 if (pre_rhs_inst == last_rhs_inst)
2209 return false; /* No instructions generated to work with. */
2210
2211 /* Make sure the last instruction generated our source reg. */
2212 if (src.file != GRF ||
2213 src.file != last_rhs_inst->dst.file ||
2214 src.reg != last_rhs_inst->dst.reg ||
2215 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2216 src.reladdr ||
2217 src.abs ||
2218 src.negate ||
2219 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2220 return false;
2221
2222    /* Check that the last instruction fully initialized the channels
2223     * we want to use, in the order we want to use them.  We could
2224     * potentially reswizzle the operands of many instructions so that
2225     * we could handle out-of-order channels, but we don't do that yet.
2226 */
2227
2228 for (unsigned i = 0; i < 4; i++) {
2229 if (dst.writemask & (1 << i)) {
2230 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2231 return false;
2232
2233 if (BRW_GET_SWZ(src.swizzle, i) != i)
2234 return false;
2235 }
2236 }
2237
2238 /* Success! Rewrite the instruction. */
2239 last_rhs_inst->dst.file = dst.file;
2240 last_rhs_inst->dst.reg = dst.reg;
2241 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2242 last_rhs_inst->dst.reladdr = dst.reladdr;
2243 last_rhs_inst->dst.writemask &= dst.writemask;
2244
2245 return true;
2246 }
2247
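/**
 * Handle assignments.  Struct, array and matrix LHS types are copied with
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * written channels and, when possible, rewrite the instruction that
 * produced the RHS to write straight into the LHS (try_rewrite_rhs_to_dst).
 */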
2248 void
2249 vec4_visitor::visit(ir_assignment *ir)
2250 {
2251 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2252 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2253
2254 if (!ir->lhs->type->is_scalar() &&
2255 !ir->lhs->type->is_vector()) {
2256 ir->rhs->accept(this);
2257 src_reg src = this->result;
2258
2259 if (ir->condition) {
2260 emit_bool_to_cond_code(ir->condition, &predicate);
2261 }
2262
2263 /* emit_block_move doesn't account for swizzles in the source register.
2264 * This should be ok, since the source register is a structure or an
2265 * array, and those can't be swizzled. But double-check to be sure.
2266 */
2267 assert(src.swizzle ==
2268 (ir->rhs->type->is_matrix()
2269 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2270 : BRW_SWIZZLE_NOOP));
2271
2272 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2273 return;
2274 }
2275
2276 /* Now we're down to just a scalar/vector with writemasks. */
2277 int i;
2278
2279 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2280 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2281
2282 ir->rhs->accept(this);
2283
2284 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 int swizzles[4];
2287 int src_chan = 0;
2288
2289 assert(ir->lhs->type->is_vector() ||
2290 ir->lhs->type->is_scalar());
2291 dst.writemask = ir->write_mask;
2292
2293 /* Swizzle a small RHS vector into the channels being written.
2294 *
2295     * GLSL IR treats write_mask as dictating how many channels are
2296     * present on the RHS, while in our instructions we need to make
2297     * those channels appear in the slots of the vec4 they're written to.
2298 */
2299 for (int i = 0; i < 4; i++)
2300 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2301
2302 src_reg src = swizzle(this->result,
2303 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2304 swizzles[2], swizzles[3]));
2305
2306 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2307 return;
2308 }
2309
2310 if (ir->condition) {
2311 emit_bool_to_cond_code(ir->condition, &predicate);
2312 }
2313
2314 for (i = 0; i < type_size(ir->lhs->type); i++) {
2315 vec4_instruction *inst = emit(MOV(dst, src));
2316 inst->predicate = predicate;
2317
2318 dst.reg_offset++;
2319 src.reg_offset++;
2320 }
2321 }
2322
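/**
 * Emit MOVs of immediate values for an ir_constant, recursing through
 * structs, arrays and matrix columns.  For vectors, channels that share the
 * same value are grouped into a single MOV via the writemask.
 */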
2323 void
2324 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2325 {
2326 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2327 foreach_in_list(ir_constant, field_value, &ir->components) {
2328 emit_constant_values(dst, field_value);
2329 }
2330 return;
2331 }
2332
2333 if (ir->type->is_array()) {
2334 for (unsigned int i = 0; i < ir->type->length; i++) {
2335 emit_constant_values(dst, ir->array_elements[i]);
2336 }
2337 return;
2338 }
2339
2340 if (ir->type->is_matrix()) {
2341 for (int i = 0; i < ir->type->matrix_columns; i++) {
2342 float *vec = &ir->value.f[i * ir->type->vector_elements];
2343
2344 for (int j = 0; j < ir->type->vector_elements; j++) {
2345 dst->writemask = 1 << j;
2346 dst->type = BRW_REGISTER_TYPE_F;
2347
2348 emit(MOV(*dst, src_reg(vec[j])));
2349 }
2350 dst->reg_offset++;
2351 }
2352 return;
2353 }
2354
2355 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2356
2357 for (int i = 0; i < ir->type->vector_elements; i++) {
2358 if (!(remaining_writemask & (1 << i)))
2359 continue;
2360
2361 dst->writemask = 1 << i;
2362 dst->type = brw_type_for_base_type(ir->type);
2363
2364 /* Find other components that match the one we're about to
2365 * write. Emits fewer instructions for things like vec4(0.5,
2366 * 1.5, 1.5, 1.5).
2367 */
2368 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2369 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2370 if (ir->value.b[i] == ir->value.b[j])
2371 dst->writemask |= (1 << j);
2372 } else {
2373 /* u, i, and f storage all line up, so no need for a
2374 * switch case for comparing each type.
2375 */
2376 if (ir->value.u[i] == ir->value.u[j])
2377 dst->writemask |= (1 << j);
2378 }
2379 }
2380
2381 switch (ir->type->base_type) {
2382 case GLSL_TYPE_FLOAT:
2383 emit(MOV(*dst, src_reg(ir->value.f[i])));
2384 break;
2385 case GLSL_TYPE_INT:
2386 emit(MOV(*dst, src_reg(ir->value.i[i])));
2387 break;
2388 case GLSL_TYPE_UINT:
2389 emit(MOV(*dst, src_reg(ir->value.u[i])));
2390 break;
2391 case GLSL_TYPE_BOOL:
2392 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2393 break;
2394 default:
2395 unreachable("Non-float/uint/int/bool constant");
2396 }
2397
2398 remaining_writemask &= ~dst->writemask;
2399 }
2400 dst->reg_offset++;
2401 }
2402
2403 void
2404 vec4_visitor::visit(ir_constant *ir)
2405 {
2406 dst_reg dst = dst_reg(this, ir->type);
2407 this->result = src_reg(dst);
2408
2409 emit_constant_values(&dst, ir);
2410 }
2411
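/**
 * Lower the atomic counter intrinsics (read, increment, predecrement) to
 * untyped surface read/atomic messages on the counter's ABO surface,
 * computing the surface offset from the counter's offset plus any dynamic
 * array index.
 */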
2412 void
2413 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2414 {
2415 ir_dereference *deref = static_cast<ir_dereference *>(
2416 ir->actual_parameters.get_head());
2417 ir_variable *location = deref->variable_referenced();
2418 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2419 location->data.binding);
2420
2421 /* Calculate the surface offset */
2422 src_reg offset(this, glsl_type::uint_type);
2423 ir_dereference_array *deref_array = deref->as_dereference_array();
2424 if (deref_array) {
2425 deref_array->array_index->accept(this);
2426
2427 src_reg tmp(this, glsl_type::uint_type);
2428 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2429 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2430 } else {
2431 offset = location->data.atomic.offset;
2432 }
2433
2434 /* Emit the appropriate machine instruction */
2435 const char *callee = ir->callee->function_name();
2436 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2437
2438 if (!strcmp("__intrinsic_atomic_read", callee)) {
2439 emit_untyped_surface_read(surf_index, dst, offset);
2440
2441 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2442 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2443 src_reg(), src_reg());
2444
2445 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2446 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448 }
2449
2450 brw_mark_surface_used(stage_prog_data, surf_index);
2451 }
2452
2453 void
2454 vec4_visitor::visit(ir_call *ir)
2455 {
2456 const char *callee = ir->callee->function_name();
2457
2458 if (!strcmp("__intrinsic_atomic_read", callee) ||
2459 !strcmp("__intrinsic_atomic_increment", callee) ||
2460 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2461 visit_atomic_counter_intrinsic(ir);
2462 } else {
2463 unreachable("Unsupported intrinsic.");
2464 }
2465 }
2466
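/**
 * Fetch the MCS (multisample control surface) value for a texel so it can
 * be passed along with a compressed multisample texel fetch (ir_txf_ms).
 */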
2467 src_reg
2468 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2469 {
2470 vec4_instruction *inst =
2471 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2472 dst_reg(this, glsl_type::uvec4_type));
2473 inst->base_mrf = 2;
2474 inst->src[1] = sampler;
2475
2476 int param_base;
2477
2478 if (devinfo->gen >= 9) {
2479 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2480 vec4_instruction *header_inst = new(mem_ctx)
2481 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2482 dst_reg(MRF, inst->base_mrf));
2483
2484 emit(header_inst);
2485
2486 inst->mlen = 2;
2487 inst->header_size = 1;
2488 param_base = inst->base_mrf + 1;
2489 } else {
2490 inst->mlen = 1;
2491 param_base = inst->base_mrf;
2492 }
2493
2494    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2495 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2496 int zero_mask = 0xf & ~coord_mask;
2497
2498 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2499 coordinate));
2500
2501 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2502 src_reg(0)));
2503
2504 emit(inst);
2505 return src_reg(inst->dst);
2506 }
2507
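/* Returns true when the sampler index may be 16 or higher, in which case it
 * has to be routed through the message header; only Haswell and Gen8+ take
 * this path.
 */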
2508 static bool
2509 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2510 {
2511 if (devinfo->gen < 8 && !devinfo->is_haswell)
2512 return false;
2513
2514 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2515 }
2516
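/**
 * Handle texturing operations: resolve the sampler index (including
 * non-constant sampler array indexing), evaluate the operands, load them
 * into MRFs in the layout the sampler message expects, and emit the sampler
 * instruction.  The raw result is then swizzled per the texture swizzle key.
 */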
2517 void
2518 vec4_visitor::visit(ir_texture *ir)
2519 {
2520 uint32_t sampler =
2521 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2522
2523 ir_rvalue *nonconst_sampler_index =
2524 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2525
2526 /* Handle non-constant sampler array indexing */
2527 src_reg sampler_reg;
2528 if (nonconst_sampler_index) {
2529 /* The highest sampler which may be used by this operation is
2530 * the last element of the array. Mark it here, because the generator
2531 * doesn't have enough information to determine the bound.
2532 */
2533 uint32_t array_size = ir->sampler->as_dereference_array()
2534 ->array->type->array_size();
2535
2536 uint32_t max_used = sampler + array_size - 1;
2537 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2538 max_used += prog_data->base.binding_table.gather_texture_start;
2539 } else {
2540 max_used += prog_data->base.binding_table.texture_start;
2541 }
2542
2543 brw_mark_surface_used(&prog_data->base, max_used);
2544
2545 /* Emit code to evaluate the actual indexing expression */
2546 nonconst_sampler_index->accept(this);
2547 src_reg temp(this, glsl_type::uint_type);
2548 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2549 sampler_reg = emit_uniformize(temp);
2550 } else {
2551 /* Single sampler, or constant array index; the indexing expression
2552 * is just an immediate.
2553 */
2554 sampler_reg = src_reg(sampler);
2555 }
2556
2557 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2558 * emitting anything other than setting up the constant result.
2559 */
2560 if (ir->op == ir_tg4) {
2561 ir_constant *chan = ir->lod_info.component->as_constant();
2562 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2563 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2564 dst_reg result(this, ir->type);
2565 this->result = src_reg(result);
2566 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2567 return;
2568 }
2569 }
2570
2571 /* Should be lowered by do_lower_texture_projection */
2572 assert(!ir->projector);
2573
2574 /* Should be lowered */
2575 assert(!ir->offset || !ir->offset->type->is_array());
2576
2577 /* Generate code to compute all the subexpression trees. This has to be
2578 * done before loading any values into MRFs for the sampler message since
2579 * generating these values may involve SEND messages that need the MRFs.
2580 */
2581 src_reg coordinate;
2582 if (ir->coordinate) {
2583 ir->coordinate->accept(this);
2584 coordinate = this->result;
2585 }
2586
2587 src_reg shadow_comparitor;
2588 if (ir->shadow_comparitor) {
2589 ir->shadow_comparitor->accept(this);
2590 shadow_comparitor = this->result;
2591 }
2592
2593 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2594 src_reg offset_value;
2595 if (has_nonconstant_offset) {
2596 ir->offset->accept(this);
2597 offset_value = src_reg(this->result);
2598 }
2599
2600 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2601 src_reg lod, dPdx, dPdy, sample_index, mcs;
2602 switch (ir->op) {
2603 case ir_tex:
2604 lod = src_reg(0.0f);
2605 lod_type = glsl_type::float_type;
2606 break;
2607 case ir_txf:
2608 case ir_txl:
2609 case ir_txs:
2610 ir->lod_info.lod->accept(this);
2611 lod = this->result;
2612 lod_type = ir->lod_info.lod->type;
2613 break;
2614 case ir_query_levels:
2615 lod = src_reg(0);
2616 lod_type = glsl_type::int_type;
2617 break;
2618 case ir_txf_ms:
2619 ir->lod_info.sample_index->accept(this);
2620 sample_index = this->result;
2621 sample_index_type = ir->lod_info.sample_index->type;
2622
2623 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2624 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2625 else
2626 mcs = src_reg(0u);
2627 break;
2628 case ir_txd:
2629 ir->lod_info.grad.dPdx->accept(this);
2630 dPdx = this->result;
2631
2632 ir->lod_info.grad.dPdy->accept(this);
2633 dPdy = this->result;
2634
2635 lod_type = ir->lod_info.grad.dPdx->type;
2636 break;
2637 case ir_txb:
2638 case ir_lod:
2639 case ir_tg4:
2640 break;
2641 }
2642
2643 enum opcode opcode;
2644 switch (ir->op) {
2645 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2646 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2647 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2648 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2649 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2650 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2651 case ir_tg4: opcode = has_nonconstant_offset
2652 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2653 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2654 case ir_txb:
2655 unreachable("TXB is not valid for vertex shaders.");
2656 case ir_lod:
2657 unreachable("LOD is not valid for vertex shaders.");
2658 default:
2659 unreachable("Unrecognized tex op");
2660 }
2661
2662 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2663 opcode, dst_reg(this, ir->type));
2664
2665 if (ir->offset != NULL && !has_nonconstant_offset) {
2666 inst->offset =
2667 brw_texture_offset(ir->offset->as_constant()->value.i,
2668 ir->offset->type->vector_elements);
2669 }
2670
2671 /* Stuff the channel select bits in the top of the texture offset */
2672 if (ir->op == ir_tg4)
2673 inst->offset |= gather_channel(ir, sampler) << 16;
2674
2675 /* The message header is necessary for:
2676 * - Gen4 (always)
2677 * - Gen9+ for selecting SIMD4x2
2678 * - Texel offsets
2679 * - Gather channel selection
2680 * - Sampler indices too large to fit in a 4-bit value.
2681 */
2682 inst->header_size =
2683 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2684 inst->offset != 0 || ir->op == ir_tg4 ||
2685 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2686 inst->base_mrf = 2;
2687 inst->mlen = inst->header_size + 1; /* always at least one */
2688 inst->dst.writemask = WRITEMASK_XYZW;
2689 inst->shadow_compare = ir->shadow_comparitor != NULL;
2690
2691 inst->src[1] = sampler_reg;
2692
2693 /* MRF for the first parameter */
2694 int param_base = inst->base_mrf + inst->header_size;
2695
2696 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2697 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2698 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2699 } else {
2700 /* Load the coordinate */
2701 /* FINISHME: gl_clamp_mask and saturate */
2702 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2703 int zero_mask = 0xf & ~coord_mask;
2704
2705 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2706 coordinate));
2707
2708 if (zero_mask != 0) {
2709 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2710 src_reg(0)));
2711 }
2712 /* Load the shadow comparitor */
2713 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2714 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2715 WRITEMASK_X),
2716 shadow_comparitor));
2717 inst->mlen++;
2718 }
2719
2720 /* Load the LOD info */
2721 if (ir->op == ir_tex || ir->op == ir_txl) {
2722 int mrf, writemask;
2723 if (devinfo->gen >= 5) {
2724 mrf = param_base + 1;
2725 if (ir->shadow_comparitor) {
2726 writemask = WRITEMASK_Y;
2727 /* mlen already incremented */
2728 } else {
2729 writemask = WRITEMASK_X;
2730 inst->mlen++;
2731 }
2732 } else /* devinfo->gen == 4 */ {
2733 mrf = param_base;
2734 writemask = WRITEMASK_W;
2735 }
2736 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2737 } else if (ir->op == ir_txf) {
2738 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2739 } else if (ir->op == ir_txf_ms) {
2740 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2741 sample_index));
2742 if (devinfo->gen >= 7) {
2743 /* MCS data is in the first channel of `mcs`, but we need to get it into
2744 * the .y channel of the second vec4 of params, so replicate .x across
2745 * the whole vec4 and then mask off everything except .y
2746 */
2747 mcs.swizzle = BRW_SWIZZLE_XXXX;
2748 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2749 mcs));
2750 }
2751 inst->mlen++;
2752 } else if (ir->op == ir_txd) {
2753 const glsl_type *type = lod_type;
2754
2755 if (devinfo->gen >= 5) {
2756 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2757 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2758 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2759 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2760 inst->mlen++;
2761
2762 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2763 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2764 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2765 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2766 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2767 inst->mlen++;
2768
2769 if (ir->shadow_comparitor) {
2770 emit(MOV(dst_reg(MRF, param_base + 2,
2771 ir->shadow_comparitor->type, WRITEMASK_Z),
2772 shadow_comparitor));
2773 }
2774 }
2775 } else /* devinfo->gen == 4 */ {
2776 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2777 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2778 inst->mlen += 2;
2779 }
2780 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2781 if (ir->shadow_comparitor) {
2782 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2783 shadow_comparitor));
2784 }
2785
2786 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2787 offset_value));
2788 inst->mlen++;
2789 }
2790 }
2791
2792 emit(inst);
2793
2794    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2795     * faces * layers, but the spec requires layers.
2796 */
2797 if (ir->op == ir_txs) {
2798 glsl_type const *type = ir->sampler->type;
2799 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2800 type->sampler_array) {
2801 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2802 writemask(inst->dst, WRITEMASK_Z),
2803 src_reg(inst->dst), src_reg(6));
2804 }
2805 }
2806
2807 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2808 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2809 }
2810
2811 swizzle_result(ir, src_reg(inst->dst), sampler);
2812 }
2813
2814 /**
2815 * Apply workarounds for Gen6 gather with UINT/SINT
2816 */
2817 void
2818 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2819 {
2820 if (!wa)
2821 return;
2822
2823 int width = (wa & WA_8BIT) ? 8 : 16;
2824 dst_reg dst_f = dst;
2825 dst_f.type = BRW_REGISTER_TYPE_F;
2826
2827 /* Convert from UNORM to UINT */
2828 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2829 emit(MOV(dst, src_reg(dst_f)));
2830
2831 if (wa & WA_SIGN) {
2832 /* Reinterpret the UINT value as a signed INT value by
2833 * shifting the sign bit into place, then shifting back
2834 * preserving sign.
2835 */
2836 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2837 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2838 }
2839 }
2840
2841 /**
2842 * Set up the gather channel based on the swizzle, for gather4.
2843 */
2844 uint32_t
2845 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2846 {
2847 ir_constant *chan = ir->lod_info.component->as_constant();
2848 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2849 switch (swiz) {
2850 case SWIZZLE_X: return 0;
2851 case SWIZZLE_Y:
2852 /* gather4 sampler is broken for green channel on RG32F --
2853 * we must ask for blue instead.
2854 */
2855 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2856 return 2;
2857 return 1;
2858 case SWIZZLE_Z: return 2;
2859 case SWIZZLE_W: return 3;
2860 default:
2861 unreachable("Not reached"); /* zero, one swizzles handled already */
2862 }
2863 }
2864
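/**
 * Apply the GL texture swizzle (including ZERO/ONE components) to the raw
 * sampler result and write the swizzled value into this->result.
 */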
2865 void
2866 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2867 {
2868 int s = key->tex.swizzles[sampler];
2869
2870 this->result = src_reg(this, ir->type);
2871 dst_reg swizzled_result(this->result);
2872
2873 if (ir->op == ir_query_levels) {
2874 /* # levels is in .w */
2875 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2876 emit(MOV(swizzled_result, orig_val));
2877 return;
2878 }
2879
2880 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2881 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2882 emit(MOV(swizzled_result, orig_val));
2883 return;
2884 }
2885
2886
2887 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2888 int swizzle[4] = {0};
2889
2890 for (int i = 0; i < 4; i++) {
2891 switch (GET_SWZ(s, i)) {
2892 case SWIZZLE_ZERO:
2893 zero_mask |= (1 << i);
2894 break;
2895 case SWIZZLE_ONE:
2896 one_mask |= (1 << i);
2897 break;
2898 default:
2899 copy_mask |= (1 << i);
2900 swizzle[i] = GET_SWZ(s, i);
2901 break;
2902 }
2903 }
2904
2905 if (copy_mask) {
2906 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2907 swizzled_result.writemask = copy_mask;
2908 emit(MOV(swizzled_result, orig_val));
2909 }
2910
2911 if (zero_mask) {
2912 swizzled_result.writemask = zero_mask;
2913 emit(MOV(swizzled_result, src_reg(0.0f)));
2914 }
2915
2916 if (one_mask) {
2917 swizzled_result.writemask = one_mask;
2918 emit(MOV(swizzled_result, src_reg(1.0f)));
2919 }
2920 }
2921
2922 void
2923 vec4_visitor::visit(ir_return *)
2924 {
2925 unreachable("not reached");
2926 }
2927
2928 void
2929 vec4_visitor::visit(ir_discard *)
2930 {
2931 unreachable("not reached");
2932 }
2933
2934 void
2935 vec4_visitor::visit(ir_if *ir)
2936 {
2937 /* Don't point the annotation at the if statement, because then it plus
2938 * the then and else blocks get printed.
2939 */
2940 this->base_ir = ir->condition;
2941
2942 if (devinfo->gen == 6) {
2943 emit_if_gen6(ir);
2944 } else {
2945 enum brw_predicate predicate;
2946 emit_bool_to_cond_code(ir->condition, &predicate);
2947 emit(IF(predicate));
2948 }
2949
2950 visit_instructions(&ir->then_instructions);
2951
2952 if (!ir->else_instructions.is_empty()) {
2953 this->base_ir = ir->condition;
2954 emit(BRW_OPCODE_ELSE);
2955
2956 visit_instructions(&ir->else_instructions);
2957 }
2958
2959 this->base_ir = ir->condition;
2960 emit(BRW_OPCODE_ENDIF);
2961 }
2962
2963 void
2964 vec4_visitor::visit(ir_emit_vertex *)
2965 {
2966 unreachable("not reached");
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_end_primitive *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_barrier *)
2977 {
2978 unreachable("not reached");
2979 }
2980
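/**
 * Emit an untyped atomic message: the offset and up to two operands are
 * loaded into the X channel of consecutive message registers, followed by
 * the SHADER_OPCODE_UNTYPED_ATOMIC send.
 */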
2981 void
2982 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2983 dst_reg dst, src_reg offset,
2984 src_reg src0, src_reg src1)
2985 {
2986 unsigned mlen = 0;
2987
2988 /* Set the atomic operation offset. */
2989 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2990 mlen++;
2991
2992 /* Set the atomic operation arguments. */
2993 if (src0.file != BAD_FILE) {
2994 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2995 mlen++;
2996 }
2997
2998 if (src1.file != BAD_FILE) {
2999 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3000 mlen++;
3001 }
3002
3003 /* Emit the instruction. Note that this maps to the normal SIMD8
3004 * untyped atomic message on Ivy Bridge, but that's OK because
3005 * unused channels will be masked out.
3006 */
3007 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3008 brw_message_reg(0),
3009 src_reg(surf_index), src_reg(atomic_op));
3010 inst->mlen = mlen;
3011 }
3012
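/**
 * Emit an untyped surface read: load the offset into the X channel of the
 * message register and send SHADER_OPCODE_UNTYPED_SURFACE_READ.
 */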
3013 void
3014 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3015 src_reg offset)
3016 {
3017 /* Set the surface read offset. */
3018 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3019
3020 /* Emit the instruction. Note that this maps to the normal SIMD8
3021 * untyped surface read message, but that's OK because unused
3022 * channels will be masked out.
3023 */
3024 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3025 brw_message_reg(0),
3026 src_reg(surf_index), src_reg(1));
3027 inst->mlen = 1;
3028 }
3029
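/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position, which the
 * pre-Gen6 vertex output path below requires.
 */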
3030 void
3031 vec4_visitor::emit_ndc_computation()
3032 {
3033 /* Get the position */
3034 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3035
3036 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3037 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3038 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3039
3040 current_annotation = "NDC";
3041 dst_reg ndc_w = ndc;
3042 ndc_w.writemask = WRITEMASK_W;
3043 src_reg pos_w = pos;
3044 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3045 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3046
3047 dst_reg ndc_xyz = ndc;
3048 ndc_xyz.writemask = WRITEMASK_XYZ;
3049
3050 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3051 }
3052
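/**
 * Fill the PSIZ slot of the VUE header.  On pre-Gen6 this packs the point
 * size and user clip flags (plus the negative-RHW workaround on hardware
 * that needs it) into a single header word; on Gen6+ it writes point size,
 * layer and viewport index into the .w, .y and .z channels when those
 * outputs are present.
 */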
3053 void
3054 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3055 {
3056 if (devinfo->gen < 6 &&
3057 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3058 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3059 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3060 dst_reg header1_w = header1;
3061 header1_w.writemask = WRITEMASK_W;
3062
3063 emit(MOV(header1, 0u));
3064
3065 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3066 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3067
3068 current_annotation = "Point size";
3069 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3070 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3071 }
3072
3073 if (key->userclip_active) {
3074 current_annotation = "Clipping flags";
3075 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3076 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3077
3078 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3079 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3080 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3081
3082 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3083 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3084 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3085 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3086 }
3087
3088 /* i965 clipping workaround:
3089        * 1) Test for negative RHW
3090 * 2) If set,
3091 * set ndc = (0,0,0,0)
3092 * set ucp[6] = 1
3093 *
3094 * Later, clipping will detect ucp[6] and ensure the primitive is
3095 * clipped against all fixed planes.
3096 */
3097 if (devinfo->has_negative_rhw_bug) {
3098 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3099 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3100 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3101 vec4_instruction *inst;
3102 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3103 inst->predicate = BRW_PREDICATE_NORMAL;
3104 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3105 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3106 inst->predicate = BRW_PREDICATE_NORMAL;
3107 }
3108
3109 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3110 } else if (devinfo->gen < 6) {
3111 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3112 } else {
3113 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3114 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3115 dst_reg reg_w = reg;
3116 reg_w.writemask = WRITEMASK_W;
3117 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3118 reg_as_src.type = reg_w.type;
3119 reg_as_src.swizzle = brw_swizzle_for_size(1);
3120 emit(MOV(reg_w, reg_as_src));
3121 }
3122 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3123 dst_reg reg_y = reg;
3124 reg_y.writemask = WRITEMASK_Y;
3125 reg_y.type = BRW_REGISTER_TYPE_D;
3126 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3127 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3128 }
3129 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3130 dst_reg reg_z = reg;
3131 reg_z.writemask = WRITEMASK_Z;
3132 reg_z.type = BRW_REGISTER_TYPE_D;
3133 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3134 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3135 }
3136 }
3137 }
3138
3139 void
3140 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3141 {
3142 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3143 *
3144 * "If a linked set of shaders forming the vertex stage contains no
3145 * static write to gl_ClipVertex or gl_ClipDistance, but the
3146 * application has requested clipping against user clip planes through
3147 * the API, then the coordinate written to gl_Position is used for
3148 * comparison against the user clip planes."
3149 *
3150 * This function is only called if the shader didn't write to
3151 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3152 * if the user wrote to it; otherwise we use gl_Position.
3153 */
3154 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3155 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3156 clip_vertex = VARYING_SLOT_POS;
3157 }
3158
3159 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3160 ++i) {
3161 reg.writemask = 1 << i;
3162 emit(DP4(reg,
3163 src_reg(output_reg[clip_vertex]),
3164 src_reg(this->userplane[i + offset])));
3165 }
3166 }
3167
3168 vec4_instruction *
3169 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3170 {
3171 assert(varying < VARYING_SLOT_MAX);
3172 assert(output_reg[varying].type == reg.type);
3173 current_annotation = output_reg_annotation[varying];
3174 /* Copy the register, saturating if necessary */
3175 return emit(MOV(reg, src_reg(output_reg[varying])));
3176 }
3177
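/**
 * Emit the MOV(s) that fill one VUE slot (one MRF) of the URB write,
 * handling the special built-in slots (PSIZ, NDC, position, edge flag,
 * clamped colors) and falling back to a plain copy for generic varyings.
 */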
3178 void
3179 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3180 {
3181 reg.type = BRW_REGISTER_TYPE_F;
3182 output_reg[varying].type = reg.type;
3183
3184 switch (varying) {
3185 case VARYING_SLOT_PSIZ:
3186 {
3187 /* PSIZ is always in slot 0, and is coupled with other flags. */
3188 current_annotation = "indices, point width, clip flags";
3189 emit_psiz_and_flags(reg);
3190 break;
3191 }
3192 case BRW_VARYING_SLOT_NDC:
3193 current_annotation = "NDC";
3194 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3195 break;
3196 case VARYING_SLOT_POS:
3197 current_annotation = "gl_Position";
3198 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3199 break;
3200 case VARYING_SLOT_EDGE:
3201 /* This is present when doing unfilled polygons. We're supposed to copy
3202 * the edge flag from the user-provided vertex array
3203 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3204 * of that attribute (starts as 1.0f). This is then used in clipping to
3205 * determine which edges should be drawn as wireframe.
3206 */
3207 current_annotation = "edge flag";
3208 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3209 glsl_type::float_type, WRITEMASK_XYZW))));
3210 break;
3211 case BRW_VARYING_SLOT_PAD:
3212 /* No need to write to this slot */
3213 break;
3214 case VARYING_SLOT_COL0:
3215 case VARYING_SLOT_COL1:
3216 case VARYING_SLOT_BFC0:
3217 case VARYING_SLOT_BFC1: {
3218 /* These built-in varyings are only supported in compatibility mode,
3219 * and we only support GS in core profile. So, this must be a vertex
3220 * shader.
3221 */
3222 assert(stage == MESA_SHADER_VERTEX);
3223 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3224 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3225 inst->saturate = true;
3226 break;
3227 }
3228
3229 default:
3230 emit_generic_urb_slot(reg, varying);
3231 break;
3232 }
3233 }
3234
3235 static int
3236 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3237 {
3238 if (devinfo->gen >= 6) {
3239 /* URB data written (does not include the message header reg) must
3240 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3241 * section 5.4.3.2.2: URB_INTERLEAVED.
3242 *
3243 * URB entries are allocated on a multiple of 1024 bits, so an
3244 * extra 128 bits written here to make the end align to 256 is
3245 * no problem.
3246 */
3247 if ((mlen % 2) != 1)
3248 mlen++;
3249 }
3250
3251 return mlen;
3252 }
3253
3254
3255 /**
3256 * Generates the VUE payload plus the necessary URB write instructions to
3257 * output it.
3258 *
3259 * The VUE layout is documented in Volume 2a.
3260 */
3261 void
3262 vec4_visitor::emit_vertex()
3263 {
3264 /* MRF 0 is reserved for the debugger, so start with message header
3265 * in MRF 1.
3266 */
3267 int base_mrf = 1;
3268 int mrf = base_mrf;
3269 /* In the process of generating our URB write message contents, we
3270 * may need to unspill a register or load from an array. Those
3271 * reads would use MRFs 14-15.
3272 */
3273 int max_usable_mrf = 13;
3274
3275 /* The following assertion verifies that max_usable_mrf causes an
3276 * even-numbered amount of URB write data, which will meet gen6's
3277 * requirements for length alignment.
3278 */
3279 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3280
3281 /* First mrf is the g0-based message header containing URB handles and
3282 * such.
3283 */
3284 emit_urb_write_header(mrf++);
3285
3286 if (devinfo->gen < 6) {
3287 emit_ndc_computation();
3288 }
3289
3290    /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3291 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3292 current_annotation = "user clip distances";
3293
3294 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3295 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3296
3297 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3298 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3299 }
3300
3301 /* We may need to split this up into several URB writes, so do them in a
3302 * loop.
3303 */
3304 int slot = 0;
3305 bool complete = false;
3306 do {
3307 /* URB offset is in URB row increments, and each of our MRFs is half of
3308 * one of those, since we're doing interleaved writes.
3309 */
3310 int offset = slot / 2;
3311
3312 mrf = base_mrf + 1;
3313 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3314 emit_urb_slot(dst_reg(MRF, mrf++),
3315 prog_data->vue_map.slot_to_varying[slot]);
3316
3317 /* If this was max_usable_mrf, we can't fit anything more into this
3318 * URB WRITE.
3319 */
3320 if (mrf > max_usable_mrf) {
3321 slot++;
3322 break;
3323 }
3324 }
3325
3326 complete = slot >= prog_data->vue_map.num_slots;
3327 current_annotation = "URB write";
3328 vec4_instruction *inst = emit_urb_write_opcode(complete);
3329 inst->base_mrf = base_mrf;
3330 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3331 inst->offset += offset;
3332 } while(!complete);
3333 }
3334
3335
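/**
 * Compute the offset source for a scratch read/write: either an immediate
 * or, for relative addressing, a register holding reladdr + reg_offset,
 * scaled to the units the scratch message expects.
 */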
3336 src_reg
3337 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3338 src_reg *reladdr, int reg_offset)
3339 {
3340 /* Because we store the values to scratch interleaved like our
3341 * vertex data, we need to scale the vec4 index by 2.
3342 */
3343 int message_header_scale = 2;
3344
3345 /* Pre-gen6, the message header uses byte offsets instead of vec4
3346 * (16-byte) offset units.
3347 */
3348 if (devinfo->gen < 6)
3349 message_header_scale *= 16;
3350
3351 if (reladdr) {
3352 src_reg index = src_reg(this, glsl_type::int_type);
3353
3354 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3355 src_reg(reg_offset)));
3356 emit_before(block, inst, MUL(dst_reg(index), index,
3357 src_reg(message_header_scale)));
3358
3359 return index;
3360 } else {
3361 return src_reg(reg_offset * message_header_scale);
3362 }
3363 }
3364
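/**
 * Compute the offset source for a pull constant load, analogous to
 * get_scratch_offset() but without the interleaving scale, and using a GRF
 * rather than an immediate on Gen8+ so we can send from GRF.
 */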
3365 src_reg
3366 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3367 src_reg *reladdr, int reg_offset)
3368 {
3369 if (reladdr) {
3370 src_reg index = src_reg(this, glsl_type::int_type);
3371
3372 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3373 src_reg(reg_offset)));
3374
3375 /* Pre-gen6, the message header uses byte offsets instead of vec4
3376 * (16-byte) offset units.
3377 */
3378 if (devinfo->gen < 6) {
3379 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3380 }
3381
3382 return index;
3383 } else if (devinfo->gen >= 8) {
3384 /* Store the offset in a GRF so we can send-from-GRF. */
3385 src_reg offset = src_reg(this, glsl_type::int_type);
3386 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3387 return offset;
3388 } else {
3389 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3390 return src_reg(reg_offset * message_header_scale);
3391 }
3392 }
3393
3394 /**
3395 * Emits an instruction before @inst to load the value named by @orig_src
3396 * from scratch space at @base_offset to @temp.
3397 *
3398 * @base_offset is measured in 32-byte units (the size of a register).
3399 */
3400 void
3401 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3402 dst_reg temp, src_reg orig_src,
3403 int base_offset)
3404 {
3405 int reg_offset = base_offset + orig_src.reg_offset;
3406 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3407 reg_offset);
3408
3409 emit_before(block, inst, SCRATCH_READ(temp, index));
3410 }
3411
3412 /**
3413 * Emits an instruction after @inst to store the value to be written
3414 * to @orig_dst to scratch space at @base_offset, from @temp.
3415 *
3416 * @base_offset is measured in 32-byte units (the size of a register).
3417 */
3418 void
3419 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3420 int base_offset)
3421 {
3422 int reg_offset = base_offset + inst->dst.reg_offset;
3423 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3424 reg_offset);
3425
3426 /* Create a temporary register to store *inst's result in.
3427 *
3428 * We have to be careful in MOVing from our temporary result register in
3429 * the scratch write. If we swizzle from channels of the temporary that
3430 * weren't initialized, it will confuse live interval analysis, which will
3431 * make spilling fail to make progress.
3432 */
3433 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3434 inst->dst.type),
3435 brw_swizzle_for_mask(inst->dst.writemask));
3436 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3437 inst->dst.writemask));
3438 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3439 write->predicate = inst->predicate;
3440 write->ir = inst->ir;
3441 write->annotation = inst->annotation;
3442 inst->insert_after(block, write);
3443
3444 inst->dst.file = temp.file;
3445 inst->dst.reg = temp.reg;
3446 inst->dst.reg_offset = temp.reg_offset;
3447 inst->dst.reladdr = NULL;
3448 }
3449
3450 /**
3451 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3452 * adds the scratch read(s) before \p inst. The function also checks for
3453 * recursive reladdr scratch accesses, issuing the corresponding scratch
3454 * loads and rewriting reladdr references accordingly.
3455 *
3456 * \return \p src if it did not require a scratch load, otherwise, the
3457 * register holding the result of the scratch load that the caller should
3458 * use to rewrite src.
3459 */
3460 src_reg
3461 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3462 vec4_instruction *inst, src_reg src)
3463 {
3464 /* Resolve recursive reladdr scratch access by calling ourselves
3465 * with src.reladdr
3466 */
3467 if (src.reladdr)
3468 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3469 *src.reladdr);
3470
3471 /* Now handle scratch access on src */
3472 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3473 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3474 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3475 src.reg = temp.reg;
3476 src.reg_offset = temp.reg_offset;
3477 src.reladdr = NULL;
3478 }
3479
3480 return src;
3481 }
3482
3483 /**
3484 * We can't generally support array access in GRF space, because a
3485 * single instruction's destination can only span 2 contiguous
3486 * registers. So, we send all GRF arrays that get variable index
3487 * access to scratch space.
3488 */
3489 void
3490 vec4_visitor::move_grf_array_access_to_scratch()
3491 {
3492 int scratch_loc[this->alloc.count];
3493 memset(scratch_loc, -1, sizeof(scratch_loc));
3494
3495 /* First, calculate the set of virtual GRFs that need to be punted
3496     * to scratch due to having any array access on them, and record where
3497     * in scratch each one goes.
3498 */
3499 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3500 if (inst->dst.file == GRF && inst->dst.reladdr) {
3501 if (scratch_loc[inst->dst.reg] == -1) {
3502 scratch_loc[inst->dst.reg] = last_scratch;
3503 last_scratch += this->alloc.sizes[inst->dst.reg];
3504 }
3505
3506 for (src_reg *iter = inst->dst.reladdr;
3507 iter->reladdr;
3508 iter = iter->reladdr) {
3509 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3510 scratch_loc[iter->reg] = last_scratch;
3511 last_scratch += this->alloc.sizes[iter->reg];
3512 }
3513 }
3514 }
3515
3516 for (int i = 0 ; i < 3; i++) {
3517 for (src_reg *iter = &inst->src[i];
3518 iter->reladdr;
3519 iter = iter->reladdr) {
3520 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3521 scratch_loc[iter->reg] = last_scratch;
3522 last_scratch += this->alloc.sizes[iter->reg];
3523 }
3524 }
3525 }
3526 }
3527
3528 /* Now, for anything that will be accessed through scratch, rewrite
3529 * it to load/store. Note that this is a _safe list walk, because
3530 * we may generate a new scratch_write instruction after the one
3531 * we're processing.
3532 */
3533 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3534 /* Set up the annotation tracking for new generated instructions. */
3535 base_ir = inst->ir;
3536 current_annotation = inst->annotation;
3537
3538 /* First handle scratch access on the dst. Notice we have to handle
3539 * the case where the dst's reladdr also points to scratch space.
3540 */
3541 if (inst->dst.reladdr)
3542 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3543 *inst->dst.reladdr);
3544
3545 /* Now that we have handled any (possibly recursive) reladdr scratch
3546 * accesses for dst we can safely do the scratch write for dst itself
3547 */
3548 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3549 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3550
3551 /* Now handle scratch access on any src. In this case, since inst->src[i]
3552 * already is a src_reg, we can just call emit_resolve_reladdr with
3553 * inst->src[i] and it will take care of handling scratch loads for
3554 * both src and src.reladdr (recursively).
3555 */
3556 for (int i = 0 ; i < 3; i++) {
3557 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3558 inst->src[i]);
3559 }
3560 }
3561 }
3562
3563 /**
3564 * Emits an instruction before @inst to load the value named by @orig_src
3565 * from the pull constant buffer (surface) at @base_offset to @temp.
3566 */
3567 void
3568 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3569 dst_reg temp, src_reg orig_src,
3570 int base_offset)
3571 {
3572 int reg_offset = base_offset + orig_src.reg_offset;
3573 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3574 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3575 reg_offset);
3576
3577 emit_pull_constant_load_reg(temp,
3578 index,
3579 offset,
3580 block, inst);
3581 }
3582
3583 /**
3584 * Implements array access of uniforms by inserting a
3585 * PULL_CONSTANT_LOAD instruction.
3586 *
3587 * Unlike temporary GRF array access (where we don't support it due to
3588 * the difficulty of doing relative addressing on instruction
3589 * destinations), we could potentially do array access of uniforms
3590 * that were loaded in GRF space as push constants. In real-world
3591 * usage we've seen, though, the arrays being used are always larger
3592 * than we could load as push constants, so just always move all
3593 * uniform array access out to a pull constant buffer.
3594 */
3595 void
3596 vec4_visitor::move_uniform_array_access_to_pull_constants()
3597 {
3598 int pull_constant_loc[this->uniforms];
3599 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3600 bool nested_reladdr;
3601
3602 /* Walk through and find array access of uniforms. Put a copy of that
3603 * uniform in the pull constant buffer.
3604 *
3605 * Note that we don't move constant-indexed accesses to arrays. No
3606 * testing has been done of the performance impact of this choice.
3607 */
3608 do {
3609 nested_reladdr = false;
3610
3611 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3612 for (int i = 0 ; i < 3; i++) {
3613 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3614 continue;
3615
3616 int uniform = inst->src[i].reg;
3617
3618 if (inst->src[i].reladdr->reladdr)
3619 nested_reladdr = true; /* will need another pass */
3620
3621 /* If this array isn't already present in the pull constant buffer,
3622 * add it.
3623 */
3624 if (pull_constant_loc[uniform] == -1) {
3625 const gl_constant_value **values =
3626 &stage_prog_data->param[uniform * 4];
3627
3628 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3629
3630 assert(uniform < uniform_array_size);
3631 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3632 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3633 = values[j];
3634 }
3635 }
3636
3637 /* Set up the annotation tracking for new generated instructions. */
3638 base_ir = inst->ir;
3639 current_annotation = inst->annotation;
3640
3641 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3642
3643 emit_pull_constant_load(block, inst, temp, inst->src[i],
3644 pull_constant_loc[uniform]);
3645
3646 inst->src[i].file = temp.file;
3647 inst->src[i].reg = temp.reg;
3648 inst->src[i].reg_offset = temp.reg_offset;
3649 inst->src[i].reladdr = NULL;
3650 }
3651 }
3652 } while (nested_reladdr);
3653
3654 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3655 * no need to track them as larger-than-vec4 objects. This will be
3656 * relied on in cutting out unused uniform vectors from push
3657 * constants.
3658 */
3659 split_uniform_registers();
3660 }
3661
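/**
 * Resolve a source negate on an unsigned doubleword (UD) register.
 *
 * The negated value is materialized by copying the source (negate modifier
 * and all) through a MOV into a fresh uvec4 temporary; the caller's source
 * is then replaced with that temporary, which carries no negate modifier
 * (presumably because the hardware's source-negate modifier cannot be
 * relied on for UD operands in the instructions that call this).
 */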
3662 void
3663 vec4_visitor::resolve_ud_negate(src_reg *reg)
3664 {
3665 if (reg->type != BRW_REGISTER_TYPE_UD ||
3666 !reg->negate)
3667 return;
3668
3669 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3670 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3671 *reg = temp;
3672 }
3673
3674 /**
3675 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3676 *
3677 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3678 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3679 */
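/* Worked example (hypothetical register contents): if the CMP wrote
 * 0x8000ac01, only bit 0 is meaningful.  AND with 1 yields 0x00000001, and
 * the MOV of the negated result writes 0xffffffff (~0, i.e. true); a CMP
 * result with bit 0 clear ends up as 0x00000000 (false).
 */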
3680 void
3681 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3682 {
3683 assert(devinfo->gen <= 5);
3684
3685 if (!rvalue->type->is_boolean())
3686 return;
3687
3688 src_reg and_result = src_reg(this, rvalue->type);
3689 src_reg neg_result = src_reg(this, rvalue->type);
3690 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3691 emit(MOV(dst_reg(neg_result), negate(and_result)));
3692 *reg = neg_result;
3693 }
3694
3695 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3696 void *log_data,
3697 struct gl_program *prog,
3698 const struct brw_vue_prog_key *key,
3699 struct brw_vue_prog_data *prog_data,
3700 struct gl_shader_program *shader_prog,
3701 gl_shader_stage stage,
3702 void *mem_ctx,
3703 bool no_spills,
3704 int shader_time_index)
3705 : backend_shader(compiler, log_data, mem_ctx,
3706 shader_prog, prog, &prog_data->base, stage),
3707 key(key),
3708 prog_data(prog_data),
3709 sanity_param_count(0),
3710 fail_msg(NULL),
3711 first_non_payload_grf(0),
3712 need_all_constants_in_pull_buffer(false),
3713 no_spills(no_spills),
3714 shader_time_index(shader_time_index),
3715 last_scratch(0)
3716 {
3717 this->failed = false;
3718
3719 this->base_ir = NULL;
3720 this->current_annotation = NULL;
3721 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3722
3723 this->variable_ht = hash_table_ctor(0,
3724 hash_table_pointer_hash,
3725 hash_table_pointer_compare);
3726
3727 this->virtual_grf_start = NULL;
3728 this->virtual_grf_end = NULL;
3729 this->live_intervals = NULL;
3730
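   /* On Gen7+ there are no MRF registers; message payloads are assembled in
    * the top of the GRF file instead, so cap the allocatable GRFs below that
    * region (GEN7_MRF_HACK_START).  Earlier gens can use the full GRF file.
    */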
3731 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3732
3733 this->uniforms = 0;
3734
3735    /* Initialize uniform_array_size to at least 1 because the pre-gen6 VS requires
3736     * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3737 */
3738 this->uniform_array_size = 1;
3739 if (prog_data) {
3740 this->uniform_array_size =
3741 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3742 }
3743
3744 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3745 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3746 }
3747
3748 vec4_visitor::~vec4_visitor()
3749 {
3750 hash_table_dtor(this->variable_ht);
3751 }
3752
3753
3754 void
3755 vec4_visitor::fail(const char *format, ...)
3756 {
3757 va_list va;
3758 char *msg;
3759
3760 if (failed)
3761 return;
3762
3763 failed = true;
3764
3765 va_start(va, format);
3766 msg = ralloc_vasprintf(mem_ctx, format, va);
3767 va_end(va);
3768 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3769
3770 this->fail_msg = msg;
3771
3772 if (debug_enabled) {
3773 fprintf(stderr, "%s", msg);
3774 }
3775 }
3776
3777 } /* namespace brw */