i965: Add a devinfo field to backend_visitor and use it for gen checks
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
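/* The ALUn() macros below generate the vec4_visitor::NOT(), ADD(), MAD(), ...
 * helpers: each one allocates a vec4_instruction with the matching BRW opcode
 * but does not add it to the instruction stream, so callers pass the result
 * to emit().  ALU2_ACC additionally marks the instruction as writing the
 * accumulator, and ALU3 asserts gen >= 6 because three-source instructions
 * only exist on Gen6 and later.
 */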
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
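/* Helpers for the Gen4-style scratch (spill space) access messages.  The
 * generator assembles the actual send starting at the fixed base MRF chosen
 * here; a read takes two message registers (header plus per-channel offsets)
 * and a write takes three (the extra register carries the data).
 */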
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
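   /* elements must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively. */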
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
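   /* Gen4/5 math operands are staged through MRFs by the generator and Gen8+
    * math no longer has the Gen6/7 operand restrictions, so only Gen6 and
    * Gen7 need the fixup below.
    */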
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
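      /* Gen4/5 math is a send to the shared math unit, so the generator needs
       * a base MRF and a message length: one register for a unary operation,
       * two when a second operand is present.
       */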
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
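   /* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
    * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the UD register
    * below turns them into the per-channel shift counts <0, 8, 16, 24>.
    */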
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
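   /* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
    * scale by 255, round to even, convert to unsigned, and pack the four low
    * bytes into the destination dword.
    */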
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
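/* Set up one vec4 uniform per active user clip plane and remember which
 * UNIFORM register each one landed in (this->userplane[i]) so the code that
 * later computes clip distances can read them.
 */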
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (devinfo->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (devinfo->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (devinfo->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (devinfo->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (devinfo->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (devinfo->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (devinfo->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (devinfo->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (devinfo->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
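   /* Look at both operands of the add for a multiply, peeling off an outer
    * negate or abs so it can be folded into the MAD sources below.
    */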
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (devinfo->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
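   /* Gen6+ can compute min/max with a single SEL carrying a conditional
    * modifier; older hardware sets the flag with a CMP first and then uses a
    * predicated SEL.
    */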
1260 if (devinfo->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (devinfo->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301  * and before_inst can be NULL, in which case the instructions will be appended
1302  * to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (devinfo->gen >= 9) {
1317 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1318 src_reg header(this, glsl_type::uvec4_type, 2);
1319
1320 pull = new(mem_ctx)
1321 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1322 dst_reg(header));
1323
1324 if (before_inst)
1325 emit_before(before_block, before_inst, pull);
1326 else
1327 emit(pull);
1328
1329 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1330 offset_reg.type);
1331 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1332
1333 if (before_inst)
1334 emit_before(before_block, before_inst, pull);
1335 else
1336 emit(pull);
1337
1338 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1339 dst,
1340 surf_index,
1341 header);
1342 pull->mlen = 2;
1343 pull->header_present = true;
1344 } else if (devinfo->gen >= 7) {
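      /* Gen7/8 use a headerless send-from-GRF message: just copy the offset
       * into a GRF and issue the load.
       */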
1345 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1346
1347 grf_offset.type = offset_reg.type;
1348
1349 pull = MOV(grf_offset, offset_reg);
1350
1351 if (before_inst)
1352 emit_before(before_block, before_inst, pull);
1353 else
1354 emit(pull);
1355
1356 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1357 dst,
1358 surf_index,
1359 src_reg(grf_offset));
1360 pull->mlen = 1;
1361 } else {
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1363 dst,
1364 surf_index,
1365 offset_reg);
1366 pull->base_mrf = 14;
1367 pull->mlen = 1;
1368 }
1369
1370 if (before_inst)
1371 emit_before(before_block, before_inst, pull);
1372 else
1373 emit(pull);
1374 }
1375
1376 void
1377 vec4_visitor::visit(ir_expression *ir)
1378 {
1379 unsigned int operand;
1380 src_reg op[ARRAY_SIZE(ir->operands)];
1381 vec4_instruction *inst;
1382
1383 if (ir->operation == ir_binop_add) {
1384 if (try_emit_mad(ir))
1385 return;
1386 }
1387
1388 if (ir->operation == ir_unop_b2f) {
1389 if (try_emit_b2f_of_compare(ir))
1390 return;
1391 }
1392
1393 /* Storage for our result. Ideally for an assignment we'd be using
1394 * the actual storage for the result here, instead.
1395 */
1396 dst_reg result_dst(this, ir->type);
1397 src_reg result_src(result_dst);
1398
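   /* ir_triop_csel is handled before the generic operand walk because its
    * first operand is turned into a predicate (condition code) rather than
    * being evaluated as an ordinary source value.
    */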
1399 if (ir->operation == ir_triop_csel) {
1400 ir->operands[1]->accept(this);
1401 op[1] = this->result;
1402 ir->operands[2]->accept(this);
1403 op[2] = this->result;
1404
1405 enum brw_predicate predicate;
1406 emit_bool_to_cond_code(ir->operands[0], &predicate);
1407 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1408 inst->predicate = predicate;
1409 this->result = result_src;
1410 return;
1411 }
1412
1413 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1414 this->result.file = BAD_FILE;
1415 ir->operands[operand]->accept(this);
1416 if (this->result.file == BAD_FILE) {
1417 fprintf(stderr, "Failed to get tree for expression operand:\n");
1418 ir->operands[operand]->fprint(stderr);
1419 exit(1);
1420 }
1421 op[operand] = this->result;
1422
1423 /* Matrix expression operands should have been broken down to vector
1424 * operations already.
1425 */
1426 assert(!ir->operands[operand]->type->is_matrix());
1427 }
1428
1429 /* If nothing special happens, this is the result. */
1430 this->result = result_src;
1431
1432 switch (ir->operation) {
1433 case ir_unop_logic_not:
1434 emit(NOT(result_dst, op[0]));
1435 break;
1436 case ir_unop_neg:
1437 op[0].negate = !op[0].negate;
1438 emit(MOV(result_dst, op[0]));
1439 break;
1440 case ir_unop_abs:
1441 op[0].abs = true;
1442 op[0].negate = false;
1443 emit(MOV(result_dst, op[0]));
1444 break;
1445
1446 case ir_unop_sign:
1447 if (ir->type->is_float()) {
1448 /* AND(val, 0x80000000) gives the sign bit.
1449 *
1450 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1451 * zero.
1452 */
1453 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1454
1455 op[0].type = BRW_REGISTER_TYPE_UD;
1456 result_dst.type = BRW_REGISTER_TYPE_UD;
1457 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1458
1459 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1460 inst->predicate = BRW_PREDICATE_NORMAL;
1461
1462 this->result.type = BRW_REGISTER_TYPE_F;
1463 } else {
1464 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1465 * -> non-negative val generates 0x00000000.
1466 * Predicated OR sets 1 if val is positive.
1467 */
1468 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1469
1470 emit(ASR(result_dst, op[0], src_reg(31)));
1471
1472 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1473 inst->predicate = BRW_PREDICATE_NORMAL;
1474 }
1475 break;
1476
1477 case ir_unop_rcp:
1478 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1479 break;
1480
1481 case ir_unop_exp2:
1482 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1483 break;
1484 case ir_unop_log2:
1485 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1486 break;
1487 case ir_unop_exp:
1488 case ir_unop_log:
1489 unreachable("not reached: should be handled by ir_explog_to_explog2");
1490 case ir_unop_sin:
1491 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1492 break;
1493 case ir_unop_cos:
1494 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1495 break;
1496
1497 case ir_unop_dFdx:
1498 case ir_unop_dFdx_coarse:
1499 case ir_unop_dFdx_fine:
1500 case ir_unop_dFdy:
1501 case ir_unop_dFdy_coarse:
1502 case ir_unop_dFdy_fine:
1503 unreachable("derivatives not valid in vertex shader");
1504
1505 case ir_unop_bitfield_reverse:
1506 emit(BFREV(result_dst, op[0]));
1507 break;
1508 case ir_unop_bit_count:
1509 emit(CBIT(result_dst, op[0]));
1510 break;
1511 case ir_unop_find_msb: {
1512 src_reg temp = src_reg(this, glsl_type::uint_type);
1513
1514 inst = emit(FBH(dst_reg(temp), op[0]));
1515 inst->dst.writemask = WRITEMASK_XYZW;
1516
1517 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1518 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1519 * subtract the result from 31 to convert the MSB count into an LSB count.
1520 */
1521
1522 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1523 temp.swizzle = BRW_SWIZZLE_NOOP;
1524 emit(MOV(result_dst, temp));
1525
1526 src_reg src_tmp = src_reg(result_dst);
1527 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1528
1529 src_tmp.negate = true;
1530 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1531 inst->predicate = BRW_PREDICATE_NORMAL;
1532 break;
1533 }
1534 case ir_unop_find_lsb:
1535 emit(FBL(result_dst, op[0]));
1536 break;
1537 case ir_unop_saturate:
1538 inst = emit(MOV(result_dst, op[0]));
1539 inst->saturate = true;
1540 break;
1541
1542 case ir_unop_noise:
1543 unreachable("not reached: should be handled by lower_noise");
1544
1545 case ir_binop_add:
1546 emit(ADD(result_dst, op[0], op[1]));
1547 break;
1548 case ir_binop_sub:
1549 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1550
1551 case ir_binop_mul:
1552 if (devinfo->gen < 8 && ir->type->is_integer()) {
1553 /* For integer multiplication, the MUL uses the low 16 bits of one of
1554 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1555 * accumulates in the contribution of the upper 16 bits of that
1556 * operand. If we can determine that one of the args is in the low
1557 * 16 bits, though, we can just emit a single MUL.
1558 */
1559 if (ir->operands[0]->is_uint16_constant()) {
1560 if (devinfo->gen < 7)
1561 emit(MUL(result_dst, op[0], op[1]));
1562 else
1563 emit(MUL(result_dst, op[1], op[0]));
1564 } else if (ir->operands[1]->is_uint16_constant()) {
1565 if (devinfo->gen < 7)
1566 emit(MUL(result_dst, op[1], op[0]));
1567 else
1568 emit(MUL(result_dst, op[0], op[1]));
1569 } else {
1570 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1571
1572 emit(MUL(acc, op[0], op[1]));
1573 emit(MACH(dst_null_d(), op[0], op[1]));
1574 emit(MOV(result_dst, src_reg(acc)));
1575 }
1576 } else {
1577 emit(MUL(result_dst, op[0], op[1]));
1578 }
1579 break;
1580 case ir_binop_imul_high: {
1581 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1582
1583 emit(MUL(acc, op[0], op[1]));
1584 emit(MACH(result_dst, op[0], op[1]));
1585 break;
1586 }
1587 case ir_binop_div:
1588 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1589 assert(ir->type->is_integer());
1590 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1591 break;
1592 case ir_binop_carry: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1594
1595 emit(ADDC(dst_null_ud(), op[0], op[1]));
1596 emit(MOV(result_dst, src_reg(acc)));
1597 break;
1598 }
1599 case ir_binop_borrow: {
1600 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1601
1602 emit(SUBB(dst_null_ud(), op[0], op[1]));
1603 emit(MOV(result_dst, src_reg(acc)));
1604 break;
1605 }
1606 case ir_binop_mod:
1607 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1608 assert(ir->type->is_integer());
1609 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1610 break;
1611
1612 case ir_binop_less:
1613 case ir_binop_greater:
1614 case ir_binop_lequal:
1615 case ir_binop_gequal:
1616 case ir_binop_equal:
1617 case ir_binop_nequal: {
1618 if (devinfo->gen <= 5) {
1619 resolve_bool_comparison(ir->operands[0], &op[0]);
1620 resolve_bool_comparison(ir->operands[1], &op[1]);
1621 }
1622 emit(CMP(result_dst, op[0], op[1],
1623 brw_conditional_for_comparison(ir->operation)));
1624 break;
1625 }
1626
1627 case ir_binop_all_equal:
1628 if (devinfo->gen <= 5) {
1629 resolve_bool_comparison(ir->operands[0], &op[0]);
1630 resolve_bool_comparison(ir->operands[1], &op[1]);
1631 }
1632
1633 /* "==" operator producing a scalar boolean. */
1634 if (ir->operands[0]->type->is_vector() ||
1635 ir->operands[1]->type->is_vector()) {
1636 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1637 emit(MOV(result_dst, src_reg(0)));
1638 inst = emit(MOV(result_dst, src_reg(~0)));
1639 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1640 } else {
1641 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1642 }
1643 break;
1644 case ir_binop_any_nequal:
1645 if (devinfo->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 resolve_bool_comparison(ir->operands[1], &op[1]);
1648 }
1649
1650 /* "!=" operator producing a scalar boolean. */
1651 if (ir->operands[0]->type->is_vector() ||
1652 ir->operands[1]->type->is_vector()) {
1653 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1654
1655 emit(MOV(result_dst, src_reg(0)));
1656 inst = emit(MOV(result_dst, src_reg(~0)));
1657 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1658 } else {
1659 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1660 }
1661 break;
1662
1663 case ir_unop_any:
1664 if (devinfo->gen <= 5) {
1665 resolve_bool_comparison(ir->operands[0], &op[0]);
1666 }
1667 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1668 emit(MOV(result_dst, src_reg(0)));
1669
1670 inst = emit(MOV(result_dst, src_reg(~0)));
1671 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1672 break;
1673
1674 case ir_binop_logic_xor:
1675 emit(XOR(result_dst, op[0], op[1]));
1676 break;
1677
1678 case ir_binop_logic_or:
1679 emit(OR(result_dst, op[0], op[1]));
1680 break;
1681
1682 case ir_binop_logic_and:
1683 emit(AND(result_dst, op[0], op[1]));
1684 break;
1685
1686 case ir_binop_dot:
1687 assert(ir->operands[0]->type->is_vector());
1688 assert(ir->operands[0]->type == ir->operands[1]->type);
1689 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1690 break;
1691
1692 case ir_unop_sqrt:
1693 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1694 break;
1695 case ir_unop_rsq:
1696 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1697 break;
1698
1699 case ir_unop_bitcast_i2f:
1700 case ir_unop_bitcast_u2f:
1701 this->result = op[0];
1702 this->result.type = BRW_REGISTER_TYPE_F;
1703 break;
1704
1705 case ir_unop_bitcast_f2i:
1706 this->result = op[0];
1707 this->result.type = BRW_REGISTER_TYPE_D;
1708 break;
1709
1710 case ir_unop_bitcast_f2u:
1711 this->result = op[0];
1712 this->result.type = BRW_REGISTER_TYPE_UD;
1713 break;
1714
1715 case ir_unop_i2f:
1716 case ir_unop_i2u:
1717 case ir_unop_u2i:
1718 case ir_unop_u2f:
1719 case ir_unop_f2i:
1720 case ir_unop_f2u:
1721 emit(MOV(result_dst, op[0]));
1722 break;
1723 case ir_unop_b2i:
1724 emit(AND(result_dst, op[0], src_reg(1)));
1725 break;
1726 case ir_unop_b2f:
1727 if (devinfo->gen <= 5) {
1728 resolve_bool_comparison(ir->operands[0], &op[0]);
1729 }
1730 op[0].type = BRW_REGISTER_TYPE_D;
1731 result_dst.type = BRW_REGISTER_TYPE_D;
1732 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1733 result_dst.type = BRW_REGISTER_TYPE_F;
1734 break;
1735 case ir_unop_f2b:
1736 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1737 break;
1738 case ir_unop_i2b:
1739 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1740 break;
1741
1742 case ir_unop_trunc:
1743 emit(RNDZ(result_dst, op[0]));
1744 break;
1745 case ir_unop_ceil: {
1746 src_reg tmp = src_reg(this, ir->type);
1747 op[0].negate = !op[0].negate;
1748 emit(RNDD(dst_reg(tmp), op[0]));
1749 tmp.negate = true;
1750 emit(MOV(result_dst, tmp));
1751 }
1752 break;
1753 case ir_unop_floor:
1754 inst = emit(RNDD(result_dst, op[0]));
1755 break;
1756 case ir_unop_fract:
1757 inst = emit(FRC(result_dst, op[0]));
1758 break;
1759 case ir_unop_round_even:
1760 emit(RNDE(result_dst, op[0]));
1761 break;
1762
1763 case ir_binop_min:
1764 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1765 break;
1766 case ir_binop_max:
1767 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1768 break;
1769
1770 case ir_binop_pow:
1771 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1772 break;
1773
1774 case ir_unop_bit_not:
1775 inst = emit(NOT(result_dst, op[0]));
1776 break;
1777 case ir_binop_bit_and:
1778 inst = emit(AND(result_dst, op[0], op[1]));
1779 break;
1780 case ir_binop_bit_xor:
1781 inst = emit(XOR(result_dst, op[0], op[1]));
1782 break;
1783 case ir_binop_bit_or:
1784 inst = emit(OR(result_dst, op[0], op[1]));
1785 break;
1786
1787 case ir_binop_lshift:
1788 inst = emit(SHL(result_dst, op[0], op[1]));
1789 break;
1790
1791 case ir_binop_rshift:
1792 if (ir->type->base_type == GLSL_TYPE_INT)
1793 inst = emit(ASR(result_dst, op[0], op[1]));
1794 else
1795 inst = emit(SHR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_bfm:
1799 emit(BFI1(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_ubo_load: {
1803 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1804 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1805 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1806 src_reg offset;
1807
1808 /* Now, load the vector from that offset. */
1809 assert(ir->type->is_vector() || ir->type->is_scalar());
1810
1811 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1812 packed_consts.type = result.type;
1813 src_reg surf_index;
1814
1815 if (const_uniform_block) {
1816 /* The block index is a constant, so just emit the binding table entry
1817 * as an immediate.
1818 */
1819 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1820 const_uniform_block->value.u[0]);
1821 } else {
1822 /* The block index is not a constant. Evaluate the index expression
1823 * per-channel and add the base UBO index; the generator will select
1824 * a value from any live channel.
1825 */
1826 surf_index = src_reg(this, glsl_type::uint_type);
1827 emit(ADD(dst_reg(surf_index), op[0],
1828 src_reg(prog_data->base.binding_table.ubo_start)));
1829
1830 /* Assume this may touch any UBO. It would be nice to provide
1831 * a tighter bound, but the array information is already lowered away.
1832 */
1833 brw_mark_surface_used(&prog_data->base,
1834 prog_data->base.binding_table.ubo_start +
1835 shader_prog->NumUniformBlocks - 1);
1836 }
1837
1838 if (const_offset_ir) {
1839 if (devinfo->gen >= 8) {
1840 /* Store the offset in a GRF so we can send-from-GRF. */
1841 offset = src_reg(this, glsl_type::int_type);
1842 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1843 } else {
1844 /* Immediates are fine on older generations since they'll be moved
1845 * to a (potentially fake) MRF at the generator level.
1846 */
1847 offset = src_reg(const_offset / 16);
1848 }
1849 } else {
1850 offset = src_reg(this, glsl_type::uint_type);
1851 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1852 }
1853
1854 emit_pull_constant_load_reg(dst_reg(packed_consts),
1855 surf_index,
1856 offset,
1857 NULL, NULL /* before_block/inst */);
1858
1859 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1860 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1861 const_offset % 16 / 4,
1862 const_offset % 16 / 4,
1863 const_offset % 16 / 4);
1864
1865 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1866 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1867 emit(CMP(result_dst, packed_consts, src_reg(0u),
1868 BRW_CONDITIONAL_NZ));
1869 } else {
1870 emit(MOV(result_dst, packed_consts));
1871 }
1872 break;
1873 }
1874
1875 case ir_binop_vector_extract:
1876 unreachable("should have been lowered by vec_index_to_cond_assign");
1877
1878 case ir_triop_fma:
1879 op[0] = fix_3src_operand(op[0]);
1880 op[1] = fix_3src_operand(op[1]);
1881 op[2] = fix_3src_operand(op[2]);
1882 /* Note that the instruction's argument order is reversed from GLSL
1883 * and the IR.
1884 */
1885 emit(MAD(result_dst, op[2], op[1], op[0]));
1886 break;
1887
1888 case ir_triop_lrp:
1889 emit_lrp(result_dst, op[0], op[1], op[2]);
1890 break;
1891
1892 case ir_triop_csel:
1893 unreachable("already handled above");
1894 break;
1895
1896 case ir_triop_bfi:
1897 op[0] = fix_3src_operand(op[0]);
1898 op[1] = fix_3src_operand(op[1]);
1899 op[2] = fix_3src_operand(op[2]);
1900 emit(BFI2(result_dst, op[0], op[1], op[2]));
1901 break;
1902
1903 case ir_triop_bitfield_extract:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(BFE(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_vector_insert:
1914 unreachable("should have been lowered by lower_vector_insert");
1915
1916 case ir_quadop_bitfield_insert:
1917 unreachable("not reached: should be handled by "
1918 "bitfield_insert_to_bfm_bfi\n");
1919
1920 case ir_quadop_vector:
1921 unreachable("not reached: should be handled by lower_quadop_vector");
1922
1923 case ir_unop_pack_half_2x16:
1924 emit_pack_half_2x16(result_dst, op[0]);
1925 break;
1926 case ir_unop_unpack_half_2x16:
1927 emit_unpack_half_2x16(result_dst, op[0]);
1928 break;
1929 case ir_unop_unpack_unorm_4x8:
1930 emit_unpack_unorm_4x8(result_dst, op[0]);
1931 break;
1932 case ir_unop_unpack_snorm_4x8:
1933 emit_unpack_snorm_4x8(result_dst, op[0]);
1934 break;
1935 case ir_unop_pack_unorm_4x8:
1936 emit_pack_unorm_4x8(result_dst, op[0]);
1937 break;
1938 case ir_unop_pack_snorm_4x8:
1939 emit_pack_snorm_4x8(result_dst, op[0]);
1940 break;
1941 case ir_unop_pack_snorm_2x16:
1942 case ir_unop_pack_unorm_2x16:
1943 case ir_unop_unpack_snorm_2x16:
1944 case ir_unop_unpack_unorm_2x16:
1945 unreachable("not reached: should be handled by lower_packing_builtins");
1946 case ir_unop_unpack_half_2x16_split_x:
1947 case ir_unop_unpack_half_2x16_split_y:
1948 case ir_binop_pack_half_2x16_split:
1949 case ir_unop_interpolate_at_centroid:
1950 case ir_binop_interpolate_at_sample:
1951 case ir_binop_interpolate_at_offset:
1952 unreachable("not reached: should not occur in vertex shader");
1953 case ir_binop_ldexp:
1954 unreachable("not reached: should be handled by ldexp_to_arith()");
1955 case ir_unop_d2f:
1956 case ir_unop_f2d:
1957 case ir_unop_d2i:
1958 case ir_unop_i2d:
1959 case ir_unop_d2u:
1960 case ir_unop_u2d:
1961 case ir_unop_d2b:
1962 case ir_unop_pack_double_2x32:
1963 case ir_unop_unpack_double_2x32:
1964 case ir_unop_frexp_sig:
1965 case ir_unop_frexp_exp:
1966 unreachable("fp64 todo");
1967 }
1968 }
1969
1970
1971 void
1972 vec4_visitor::visit(ir_swizzle *ir)
1973 {
1974 /* Note that this handles only swizzles in expressions, not those on the
1975 * left-hand side of an assignment, which use write masking instead. See
1976 * ir_assignment for that.
1977 */
1978 const unsigned swz = brw_compose_swizzle(
1979 brw_swizzle_for_size(ir->type->vector_elements),
1980 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1981
1982 ir->val->accept(this);
1983 this->result = swizzle(this->result, swz);
1984 }
1985
1986 void
1987 vec4_visitor::visit(ir_dereference_variable *ir)
1988 {
1989 const struct glsl_type *type = ir->type;
1990 dst_reg *reg = variable_storage(ir->var);
1991
1992 if (!reg) {
1993 fail("Failed to find variable storage for %s\n", ir->var->name);
1994 this->result = src_reg(brw_null_reg());
1995 return;
1996 }
1997
1998 this->result = src_reg(*reg);
1999
2000 /* System values get their swizzle from the dst_reg writemask */
2001 if (ir->var->data.mode == ir_var_system_value)
2002 return;
2003
2004 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2005 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2006 }
2007
2008
2009 int
2010 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2011 {
2012 /* Under normal circumstances array elements are stored consecutively, so
2013 * the stride is equal to the size of the array element.
2014 */
2015 return type_size(ir->type);
2016 }
2017
2018
2019 void
2020 vec4_visitor::visit(ir_dereference_array *ir)
2021 {
2022 ir_constant *constant_index;
2023 src_reg src;
2024 int array_stride = compute_array_stride(ir);
2025
2026 constant_index = ir->array_index->constant_expression_value();
2027
2028 ir->array->accept(this);
2029 src = this->result;
2030
2031 if (constant_index) {
2032 src.reg_offset += constant_index->value.i[0] * array_stride;
2033 } else {
2034 /* Variable index array dereference. It combines the "vec4" at the
2035 * base of the array with an index register that offsets the Mesa
2036 * register index.
2037 */
2038 ir->array_index->accept(this);
2039
2040 src_reg index_reg;
2041
2042 if (array_stride == 1) {
2043 index_reg = this->result;
2044 } else {
2045 index_reg = src_reg(this, glsl_type::int_type);
2046
2047 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2048 }
2049
2050 if (src.reladdr) {
2051 src_reg temp = src_reg(this, glsl_type::int_type);
2052
2053 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2054
2055 index_reg = temp;
2056 }
2057
2058 src.reladdr = ralloc(mem_ctx, src_reg);
2059 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2060 }
2061
2062 /* If the type is smaller than a vec4, replicate the last channel out. */
2063 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2064 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2065 else
2066 src.swizzle = BRW_SWIZZLE_NOOP;
2067 src.type = brw_type_for_base_type(ir->type);
2068
2069 this->result = src;
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_dereference_record *ir)
2074 {
2075 unsigned int i;
2076 const glsl_type *struct_type = ir->record->type;
2077 int offset = 0;
2078
2079 ir->record->accept(this);
2080
2081 for (i = 0; i < struct_type->length; i++) {
2082 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2083 break;
2084 offset += type_size(struct_type->fields.structure[i].type);
2085 }
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 this->result.swizzle = BRW_SWIZZLE_NOOP;
2092 this->result.type = brw_type_for_base_type(ir->type);
2093
2094 this->result.reg_offset += offset;
2095 }
2096
2097 /**
2098 * We want to be careful in assignment setup to hit the actual storage
2099 * instead of potentially using a temporary like we might with the
2100 * ir_dereference handler.
2101 */
2102 static dst_reg
2103 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2104 {
2105 /* The LHS must be a dereference. If the LHS is a variable indexed array
2106 * access of a vector, it must be separated into a series of conditional moves
2107 * before reaching this point (see ir_vec_index_to_cond_assign).
2108 */
2109 assert(ir->as_dereference());
2110 ir_dereference_array *deref_array = ir->as_dereference_array();
2111 if (deref_array) {
2112 assert(!deref_array->array->type->is_vector());
2113 }
2114
2115 /* Use the rvalue deref handler for the most part. We'll ignore any
2116 * swizzles it produces and express LHS swizzles via the writemask instead.
2117 */
2118 ir->accept(v);
2119 return dst_reg(v->result);
2120 }
2121
2122 void
2123 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2124 const struct glsl_type *type,
2125 enum brw_predicate predicate)
2126 {
2127 if (type->base_type == GLSL_TYPE_STRUCT) {
2128 for (unsigned int i = 0; i < type->length; i++) {
2129 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2130 }
2131 return;
2132 }
2133
2134 if (type->is_array()) {
2135 for (unsigned int i = 0; i < type->length; i++) {
2136 emit_block_move(dst, src, type->fields.array, predicate);
2137 }
2138 return;
2139 }
2140
2141 if (type->is_matrix()) {
2142 const struct glsl_type *vec_type;
2143
2144 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2145 type->vector_elements, 1);
2146
2147 for (int i = 0; i < type->matrix_columns; i++) {
2148 emit_block_move(dst, src, vec_type, predicate);
2149 }
2150 return;
2151 }
2152
2153 assert(type->is_scalar() || type->is_vector());
2154
2155 dst->type = brw_type_for_base_type(type);
2156 src->type = dst->type;
2157
2158 dst->writemask = (1 << type->vector_elements) - 1;
2159
2160 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2161
2162 vec4_instruction *inst = emit(MOV(*dst, *src));
2163 inst->predicate = predicate;
2164
2165 dst->reg_offset++;
2166 src->reg_offset++;
2167 }
2168
2169
2170 /* If the RHS processing resulted in an instruction generating a
2171 * temporary value, and it would be easy to rewrite the instruction to
2172 * generate its result right into the LHS instead, do so. This ends
2173 * up reliably removing instructions where it can be tricky to do so
2174 * later without real UD chain information.
2175 */
2176 bool
2177 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2178 dst_reg dst,
2179 src_reg src,
2180 vec4_instruction *pre_rhs_inst,
2181 vec4_instruction *last_rhs_inst)
2182 {
2183 /* This could be supported, but it would take more smarts. */
2184 if (ir->condition)
2185 return false;
2186
2187 if (pre_rhs_inst == last_rhs_inst)
2188 return false; /* No instructions generated to work with. */
2189
2190 /* Make sure the last instruction generated our source reg. */
2191 if (src.file != GRF ||
2192 src.file != last_rhs_inst->dst.file ||
2193 src.reg != last_rhs_inst->dst.reg ||
2194 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2195 src.reladdr ||
2196 src.abs ||
2197 src.negate ||
2198 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2199 return false;
2200
2201 /* Check that the last instruction fully initialized the channels
2202 * we want to use, in the order we want to use them. We could
2203 * potentially reswizzle the operands of many instructions so that
2204 * we could handle out of order channels, but don't yet.
2205 */
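/* For example, if dst.writemask is XY, the last instruction must have
 * written at least .x and .y, and the source must read them with an
 * identity .xy swizzle for the rewrite to be safe.
 */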
2206
2207 for (unsigned i = 0; i < 4; i++) {
2208 if (dst.writemask & (1 << i)) {
2209 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2210 return false;
2211
2212 if (BRW_GET_SWZ(src.swizzle, i) != i)
2213 return false;
2214 }
2215 }
2216
2217 /* Success! Rewrite the instruction. */
2218 last_rhs_inst->dst.file = dst.file;
2219 last_rhs_inst->dst.reg = dst.reg;
2220 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2221 last_rhs_inst->dst.reladdr = dst.reladdr;
2222 last_rhs_inst->dst.writemask &= dst.writemask;
2223
2224 return true;
2225 }
2226
2227 void
2228 vec4_visitor::visit(ir_assignment *ir)
2229 {
2230 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2231 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2232
2233 if (!ir->lhs->type->is_scalar() &&
2234 !ir->lhs->type->is_vector()) {
2235 ir->rhs->accept(this);
2236 src_reg src = this->result;
2237
2238 if (ir->condition) {
2239 emit_bool_to_cond_code(ir->condition, &predicate);
2240 }
2241
2242 /* emit_block_move doesn't account for swizzles in the source register.
2243 * This should be ok, since the source register is a structure or an
2244 * array, and those can't be swizzled. But double-check to be sure.
2245 */
2246 assert(src.swizzle ==
2247 (ir->rhs->type->is_matrix()
2248 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2249 : BRW_SWIZZLE_NOOP));
2250
2251 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2252 return;
2253 }
2254
2255 /* Now we're down to just a scalar/vector with writemasks. */
2256 int i;
2257
2258 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2259 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2260
2261 ir->rhs->accept(this);
2262
2263 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2264
2265 int swizzles[4];
2266 int src_chan = 0;
2267
2268 assert(ir->lhs->type->is_vector() ||
2269 ir->lhs->type->is_scalar());
2270 dst.writemask = ir->write_mask;
2271
2272 /* Swizzle a small RHS vector into the channels being written.
2273 *
2274 * GLSL IR treats write_mask as dictating how many channels are
2275 * present on the RHS, while in our instructions we need to make
2276 * those channels appear in the slots of the vec4 they're written to.
2277 */
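/* For example, a write mask of XZ gives swizzles of {0, 0, 1, 0}: the
 * RHS .x lands in the destination's X channel and the RHS .y in Z.
 */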
2278 for (int i = 0; i < 4; i++)
2279 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2280
2281 src_reg src = swizzle(this->result,
2282 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2283 swizzles[2], swizzles[3]));
2284
2285 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2286 return;
2287 }
2288
2289 if (ir->condition) {
2290 emit_bool_to_cond_code(ir->condition, &predicate);
2291 }
2292
2293 for (i = 0; i < type_size(ir->lhs->type); i++) {
2294 vec4_instruction *inst = emit(MOV(dst, src));
2295 inst->predicate = predicate;
2296
2297 dst.reg_offset++;
2298 src.reg_offset++;
2299 }
2300 }
2301
2302 void
2303 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2304 {
2305 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2306 foreach_in_list(ir_constant, field_value, &ir->components) {
2307 emit_constant_values(dst, field_value);
2308 }
2309 return;
2310 }
2311
2312 if (ir->type->is_array()) {
2313 for (unsigned int i = 0; i < ir->type->length; i++) {
2314 emit_constant_values(dst, ir->array_elements[i]);
2315 }
2316 return;
2317 }
2318
2319 if (ir->type->is_matrix()) {
2320 for (int i = 0; i < ir->type->matrix_columns; i++) {
2321 float *vec = &ir->value.f[i * ir->type->vector_elements];
2322
2323 for (int j = 0; j < ir->type->vector_elements; j++) {
2324 dst->writemask = 1 << j;
2325 dst->type = BRW_REGISTER_TYPE_F;
2326
2327 emit(MOV(*dst, src_reg(vec[j])));
2328 }
2329 dst->reg_offset++;
2330 }
2331 return;
2332 }
2333
2334 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2335
2336 for (int i = 0; i < ir->type->vector_elements; i++) {
2337 if (!(remaining_writemask & (1 << i)))
2338 continue;
2339
2340 dst->writemask = 1 << i;
2341 dst->type = brw_type_for_base_type(ir->type);
2342
2343 /* Find other components that match the one we're about to
2344 * write. Emits fewer instructions for things like vec4(0.5,
2345 * 1.5, 1.5, 1.5).
2346 */
2347 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2348 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2349 if (ir->value.b[i] == ir->value.b[j])
2350 dst->writemask |= (1 << j);
2351 } else {
2352 /* u, i, and f storage all line up, so no need for a
2353 * switch case for comparing each type.
2354 */
2355 if (ir->value.u[i] == ir->value.u[j])
2356 dst->writemask |= (1 << j);
2357 }
2358 }
2359
2360 switch (ir->type->base_type) {
2361 case GLSL_TYPE_FLOAT:
2362 emit(MOV(*dst, src_reg(ir->value.f[i])));
2363 break;
2364 case GLSL_TYPE_INT:
2365 emit(MOV(*dst, src_reg(ir->value.i[i])));
2366 break;
2367 case GLSL_TYPE_UINT:
2368 emit(MOV(*dst, src_reg(ir->value.u[i])));
2369 break;
2370 case GLSL_TYPE_BOOL:
2371 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2372 break;
2373 default:
2374 unreachable("Non-float/uint/int/bool constant");
2375 }
2376
2377 remaining_writemask &= ~dst->writemask;
2378 }
2379 dst->reg_offset++;
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_constant *ir)
2384 {
2385 dst_reg dst = dst_reg(this, ir->type);
2386 this->result = src_reg(dst);
2387
2388 emit_constant_values(&dst, ir);
2389 }
2390
2391 void
2392 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2393 {
2394 ir_dereference *deref = static_cast<ir_dereference *>(
2395 ir->actual_parameters.get_head());
2396 ir_variable *location = deref->variable_referenced();
2397 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2398 location->data.binding);
2399
2400 /* Calculate the surface offset */
2401 src_reg offset(this, glsl_type::uint_type);
2402 ir_dereference_array *deref_array = deref->as_dereference_array();
2403 if (deref_array) {
2404 deref_array->array_index->accept(this);
2405
2406 src_reg tmp(this, glsl_type::uint_type);
2407 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2408 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2409 } else {
2410 offset = location->data.atomic.offset;
2411 }
2412
2413 /* Emit the appropriate machine instruction */
2414 const char *callee = ir->callee->function_name();
2415 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2416
2417 if (!strcmp("__intrinsic_atomic_read", callee)) {
2418 emit_untyped_surface_read(surf_index, dst, offset);
2419
2420 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2421 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2422 src_reg(), src_reg());
2423
2424 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2425 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2426 src_reg(), src_reg());
2427 }
2428 }
2429
2430 void
2431 vec4_visitor::visit(ir_call *ir)
2432 {
2433 const char *callee = ir->callee->function_name();
2434
2435 if (!strcmp("__intrinsic_atomic_read", callee) ||
2436 !strcmp("__intrinsic_atomic_increment", callee) ||
2437 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2438 visit_atomic_counter_intrinsic(ir);
2439 } else {
2440 unreachable("Unsupported intrinsic.");
2441 }
2442 }
2443
2444 src_reg
2445 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2446 {
2447 vec4_instruction *inst =
2448 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2449 dst_reg(this, glsl_type::uvec4_type));
2450 inst->base_mrf = 2;
2451 inst->mlen = 1;
2452 inst->src[1] = sampler;
2453
2454 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2455 int param_base = inst->base_mrf;
2456 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2457 int zero_mask = 0xf & ~coord_mask;
2458
2459 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2460 coordinate));
2461
2462 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2463 src_reg(0)));
2464
2465 emit(inst);
2466 return src_reg(inst->dst);
2467 }
2468
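/* Returns whether the sampler index has to be conveyed via the message
 * header. Sampler indices above 15 don't fit in the 4-bit field of the
 * sampler message descriptor, and a dynamically indexed sampler may end
 * up there too; only Haswell and Gen8+ expose that many samplers at all.
 */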
2469 static bool
2470 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2471 {
2472 if (devinfo->gen < 8 && !devinfo->is_haswell)
2473 return false;
2474
2475 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2476 }
2477
2478 void
2479 vec4_visitor::visit(ir_texture *ir)
2480 {
2481 uint32_t sampler =
2482 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2483
2484 ir_rvalue *nonconst_sampler_index =
2485 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2486
2487 /* Handle non-constant sampler array indexing */
2488 src_reg sampler_reg;
2489 if (nonconst_sampler_index) {
2490 /* The highest sampler which may be used by this operation is
2491 * the last element of the array. Mark it here, because the generator
2492 * doesn't have enough information to determine the bound.
2493 */
2494 uint32_t array_size = ir->sampler->as_dereference_array()
2495 ->array->type->array_size();
2496
2497 uint32_t max_used = sampler + array_size - 1;
2498 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2499 max_used += prog_data->base.binding_table.gather_texture_start;
2500 } else {
2501 max_used += prog_data->base.binding_table.texture_start;
2502 }
2503
2504 brw_mark_surface_used(&prog_data->base, max_used);
2505
2506 /* Emit code to evaluate the actual indexing expression */
2507 nonconst_sampler_index->accept(this);
2508 dst_reg temp(this, glsl_type::uint_type);
2509 emit(ADD(temp, this->result, src_reg(sampler)))
2510 ->force_writemask_all = true;
2511 sampler_reg = src_reg(temp);
2512 } else {
2513 /* Single sampler, or constant array index; the indexing expression
2514 * is just an immediate.
2515 */
2516 sampler_reg = src_reg(sampler);
2517 }
2518
2519 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2520 * emitting anything other than setting up the constant result.
2521 */
2522 if (ir->op == ir_tg4) {
2523 ir_constant *chan = ir->lod_info.component->as_constant();
2524 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2525 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2526 dst_reg result(this, ir->type);
2527 this->result = src_reg(result);
2528 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2529 return;
2530 }
2531 }
2532
2533 /* Should be lowered by do_lower_texture_projection */
2534 assert(!ir->projector);
2535
2536 /* Should be lowered */
2537 assert(!ir->offset || !ir->offset->type->is_array());
2538
2539 /* Generate code to compute all the subexpression trees. This has to be
2540 * done before loading any values into MRFs for the sampler message since
2541 * generating these values may involve SEND messages that need the MRFs.
2542 */
2543 src_reg coordinate;
2544 if (ir->coordinate) {
2545 ir->coordinate->accept(this);
2546 coordinate = this->result;
2547 }
2548
2549 src_reg shadow_comparitor;
2550 if (ir->shadow_comparitor) {
2551 ir->shadow_comparitor->accept(this);
2552 shadow_comparitor = this->result;
2553 }
2554
2555 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2556 src_reg offset_value;
2557 if (has_nonconstant_offset) {
2558 ir->offset->accept(this);
2559 offset_value = src_reg(this->result);
2560 }
2561
2562 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2563 src_reg lod, dPdx, dPdy, sample_index, mcs;
2564 switch (ir->op) {
2565 case ir_tex:
2566 lod = src_reg(0.0f);
2567 lod_type = glsl_type::float_type;
2568 break;
2569 case ir_txf:
2570 case ir_txl:
2571 case ir_txs:
2572 ir->lod_info.lod->accept(this);
2573 lod = this->result;
2574 lod_type = ir->lod_info.lod->type;
2575 break;
2576 case ir_query_levels:
2577 lod = src_reg(0);
2578 lod_type = glsl_type::int_type;
2579 break;
2580 case ir_txf_ms:
2581 ir->lod_info.sample_index->accept(this);
2582 sample_index = this->result;
2583 sample_index_type = ir->lod_info.sample_index->type;
2584
2585 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2586 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2587 else
2588 mcs = src_reg(0u);
2589 break;
2590 case ir_txd:
2591 ir->lod_info.grad.dPdx->accept(this);
2592 dPdx = this->result;
2593
2594 ir->lod_info.grad.dPdy->accept(this);
2595 dPdy = this->result;
2596
2597 lod_type = ir->lod_info.grad.dPdx->type;
2598 break;
2599 case ir_txb:
2600 case ir_lod:
2601 case ir_tg4:
2602 break;
2603 }
2604
2605 enum opcode opcode;
2606 switch (ir->op) {
2607 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2608 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2609 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2610 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2611 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2612 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2613 case ir_tg4: opcode = has_nonconstant_offset
2614 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2615 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2616 case ir_txb:
2617 unreachable("TXB is not valid for vertex shaders.");
2618 case ir_lod:
2619 unreachable("LOD is not valid for vertex shaders.");
2620 default:
2621 unreachable("Unrecognized tex op");
2622 }
2623
2624 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2625 opcode, dst_reg(this, ir->type));
2626
2627 if (ir->offset != NULL && !has_nonconstant_offset) {
2628 inst->offset =
2629 brw_texture_offset(ir->offset->as_constant()->value.i,
2630 ir->offset->type->vector_elements);
2631 }
2632
2633 /* Stuff the channel select bits in the top of the texture offset */
2634 if (ir->op == ir_tg4)
2635 inst->offset |= gather_channel(ir, sampler) << 16;
2636
2637 /* The message header is necessary for:
2638 * - Gen4 (always)
2639 * - Gen9+ for selecting SIMD4x2
2640 * - Texel offsets
2641 * - Gather channel selection
2642 * - Sampler indices too large to fit in a 4-bit value.
2643 */
2644 inst->header_present =
2645 devinfo->gen < 5 || devinfo->gen >= 9 ||
2646 inst->offset != 0 || ir->op == ir_tg4 ||
2647 is_high_sampler(devinfo, sampler_reg);
2648 inst->base_mrf = 2;
2649 inst->mlen = inst->header_present + 1; /* always at least one */
2650 inst->dst.writemask = WRITEMASK_XYZW;
2651 inst->shadow_compare = ir->shadow_comparitor != NULL;
2652
2653 inst->src[1] = sampler_reg;
2654
2655 /* MRF for the first parameter */
2656 int param_base = inst->base_mrf + inst->header_present;
2657
2658 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2659 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2660 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2661 } else {
2662 /* Load the coordinate */
2663 /* FINISHME: gl_clamp_mask and saturate */
2664 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2665 int zero_mask = 0xf & ~coord_mask;
2666
2667 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2668 coordinate));
2669
2670 if (zero_mask != 0) {
2671 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2672 src_reg(0)));
2673 }
2674 /* Load the shadow comparitor */
2675 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2676 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2677 WRITEMASK_X),
2678 shadow_comparitor));
2679 inst->mlen++;
2680 }
2681
2682 /* Load the LOD info */
2683 if (ir->op == ir_tex || ir->op == ir_txl) {
2684 int mrf, writemask;
2685 if (devinfo->gen >= 5) {
2686 mrf = param_base + 1;
2687 if (ir->shadow_comparitor) {
2688 writemask = WRITEMASK_Y;
2689 /* mlen already incremented */
2690 } else {
2691 writemask = WRITEMASK_X;
2692 inst->mlen++;
2693 }
2694 } else /* devinfo->gen == 4 */ {
2695 mrf = param_base;
2696 writemask = WRITEMASK_W;
2697 }
2698 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2699 } else if (ir->op == ir_txf) {
2700 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2701 } else if (ir->op == ir_txf_ms) {
2702 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2703 sample_index));
2704 if (devinfo->gen >= 7) {
2705 /* MCS data is in the first channel of `mcs`, but we need to get it into
2706 * the .y channel of the second vec4 of params, so replicate .x across
2707 * the whole vec4 and then mask off everything except .y
2708 */
2709 mcs.swizzle = BRW_SWIZZLE_XXXX;
2710 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2711 mcs));
2712 }
2713 inst->mlen++;
2714 } else if (ir->op == ir_txd) {
2715 const glsl_type *type = lod_type;
2716
2717 if (devinfo->gen >= 5) {
2718 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2719 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2720 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2721 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2722 inst->mlen++;
2723
2724 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2725 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2726 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2727 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2728 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2729 inst->mlen++;
2730
2731 if (ir->shadow_comparitor) {
2732 emit(MOV(dst_reg(MRF, param_base + 2,
2733 ir->shadow_comparitor->type, WRITEMASK_Z),
2734 shadow_comparitor));
2735 }
2736 }
2737 } else /* devinfo->gen == 4 */ {
2738 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2739 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2740 inst->mlen += 2;
2741 }
2742 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2743 if (ir->shadow_comparitor) {
2744 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2745 shadow_comparitor));
2746 }
2747
2748 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2749 offset_value));
2750 inst->mlen++;
2751 }
2752 }
2753
2754 emit(inst);
2755
2756 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2757 * faces * layers, but the spec requires just layers.
2758 */
2759 if (ir->op == ir_txs) {
2760 glsl_type const *type = ir->sampler->type;
2761 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2762 type->sampler_array) {
2763 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2764 writemask(inst->dst, WRITEMASK_Z),
2765 src_reg(inst->dst), src_reg(6));
2766 }
2767 }
2768
2769 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2770 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2771 }
2772
2773 swizzle_result(ir, src_reg(inst->dst), sampler);
2774 }
2775
2776 /**
2777 * Apply workarounds for Gen6 gather with UINT/SINT
2778 */
2779 void
2780 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2781 {
2782 if (!wa)
2783 return;
2784
2785 int width = (wa & WA_8BIT) ? 8 : 16;
2786 dst_reg dst_f = dst;
2787 dst_f.type = BRW_REGISTER_TYPE_F;
2788
2789 /* Convert from UNORM to UINT */
2790 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2791 emit(MOV(dst, src_reg(dst_f)));
2792
2793 if (wa & WA_SIGN) {
2794 /* Reinterpret the UINT value as a signed INT value by
2795 * shifting the sign bit into place, then shifting back
2796 * preserving sign.
2797 */
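/* For example, with width == 8 a gathered value of 255 becomes
 * (255 << 24) >> 24 == -1 after the arithmetic shift.
 */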
2798 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2799 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2800 }
2801 }
2802
2803 /**
2804 * Set up the gather channel based on the swizzle, for gather4.
2805 */
2806 uint32_t
2807 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2808 {
2809 ir_constant *chan = ir->lod_info.component->as_constant();
2810 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2811 switch (swiz) {
2812 case SWIZZLE_X: return 0;
2813 case SWIZZLE_Y:
2814 /* gather4 sampler is broken for green channel on RG32F --
2815 * we must ask for blue instead.
2816 */
2817 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2818 return 2;
2819 return 1;
2820 case SWIZZLE_Z: return 2;
2821 case SWIZZLE_W: return 3;
2822 default:
2823 unreachable("Not reached"); /* zero, one swizzles handled already */
2824 }
2825 }
2826
2827 void
2828 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2829 {
2830 int s = key->tex.swizzles[sampler];
2831
2832 this->result = src_reg(this, ir->type);
2833 dst_reg swizzled_result(this->result);
2834
2835 if (ir->op == ir_query_levels) {
2836 /* # levels is in .w */
2837 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2838 emit(MOV(swizzled_result, orig_val));
2839 return;
2840 }
2841
2842 if (ir->op == ir_txs || ir->type == glsl_type::float_type ||
2843 s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2844 emit(MOV(swizzled_result, orig_val));
2845 return;
2846 }
2847
2848
2849 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2850 int swizzle[4] = {0};
2851
2852 for (int i = 0; i < 4; i++) {
2853 switch (GET_SWZ(s, i)) {
2854 case SWIZZLE_ZERO:
2855 zero_mask |= (1 << i);
2856 break;
2857 case SWIZZLE_ONE:
2858 one_mask |= (1 << i);
2859 break;
2860 default:
2861 copy_mask |= (1 << i);
2862 swizzle[i] = GET_SWZ(s, i);
2863 break;
2864 }
2865 }
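/* For example, a swizzle of RG01 copies .xy from the texture result,
 * writes zero to .z and one to .w, using the three MOVs below.
 */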
2866
2867 if (copy_mask) {
2868 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2869 swizzled_result.writemask = copy_mask;
2870 emit(MOV(swizzled_result, orig_val));
2871 }
2872
2873 if (zero_mask) {
2874 swizzled_result.writemask = zero_mask;
2875 emit(MOV(swizzled_result, src_reg(0.0f)));
2876 }
2877
2878 if (one_mask) {
2879 swizzled_result.writemask = one_mask;
2880 emit(MOV(swizzled_result, src_reg(1.0f)));
2881 }
2882 }
2883
2884 void
2885 vec4_visitor::visit(ir_return *)
2886 {
2887 unreachable("not reached");
2888 }
2889
2890 void
2891 vec4_visitor::visit(ir_discard *)
2892 {
2893 unreachable("not reached");
2894 }
2895
2896 void
2897 vec4_visitor::visit(ir_if *ir)
2898 {
2899 /* Don't point the annotation at the if statement itself, because then it
2900 * and the then and else blocks would all get printed.
2901 */
2902 this->base_ir = ir->condition;
2903
2904 if (devinfo->gen == 6) {
2905 emit_if_gen6(ir);
2906 } else {
2907 enum brw_predicate predicate;
2908 emit_bool_to_cond_code(ir->condition, &predicate);
2909 emit(IF(predicate));
2910 }
2911
2912 visit_instructions(&ir->then_instructions);
2913
2914 if (!ir->else_instructions.is_empty()) {
2915 this->base_ir = ir->condition;
2916 emit(BRW_OPCODE_ELSE);
2917
2918 visit_instructions(&ir->else_instructions);
2919 }
2920
2921 this->base_ir = ir->condition;
2922 emit(BRW_OPCODE_ENDIF);
2923 }
2924
2925 void
2926 vec4_visitor::visit(ir_emit_vertex *)
2927 {
2928 unreachable("not reached");
2929 }
2930
2931 void
2932 vec4_visitor::visit(ir_end_primitive *)
2933 {
2934 unreachable("not reached");
2935 }
2936
2937 void
2938 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2939 dst_reg dst, src_reg offset,
2940 src_reg src0, src_reg src1)
2941 {
2942 unsigned mlen = 0;
2943
2944 /* Set the atomic operation offset. */
2945 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2946 mlen++;
2947
2948 /* Set the atomic operation arguments. */
2949 if (src0.file != BAD_FILE) {
2950 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2951 mlen++;
2952 }
2953
2954 if (src1.file != BAD_FILE) {
2955 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2956 mlen++;
2957 }
2958
2959 /* Emit the instruction. Note that this maps to the normal SIMD8
2960 * untyped atomic message on Ivy Bridge, but that's OK because
2961 * unused channels will be masked out.
2962 */
2963 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2964 src_reg(atomic_op), src_reg(surf_index));
2965 inst->base_mrf = 0;
2966 inst->mlen = mlen;
2967 }
2968
2969 void
2970 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2971 src_reg offset)
2972 {
2973 /* Set the surface read offset. */
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2975
2976 /* Emit the instruction. Note that this maps to the normal SIMD8
2977 * untyped surface read message, but that's OK because unused
2978 * channels will be masked out.
2979 */
2980 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2981 dst, src_reg(surf_index));
2982 inst->base_mrf = 0;
2983 inst->mlen = 1;
2984 }
2985
2986 void
2987 vec4_visitor::emit_ndc_computation()
2988 {
2989 /* Get the position */
2990 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2991
2992 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2993 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2994 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2995
2996 current_annotation = "NDC";
2997 dst_reg ndc_w = ndc;
2998 ndc_w.writemask = WRITEMASK_W;
2999 src_reg pos_w = pos;
3000 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3001 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3002
3003 dst_reg ndc_xyz = ndc;
3004 ndc_xyz.writemask = WRITEMASK_XYZ;
3005
3006 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3007 }
3008
3009 void
3010 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3011 {
3012 if (devinfo->gen < 6 &&
3013 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3014 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3015 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3016 dst_reg header1_w = header1;
3017 header1_w.writemask = WRITEMASK_W;
3018
3019 emit(MOV(header1, 0u));
3020
3021 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3022 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3023
3024 current_annotation = "Point size";
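/* Multiplying by 2^11 and masking with 0x7ff << 8 leaves an 11-bit
 * fixed-point point width in bits 18:8 of the header DWord. For example,
 * psiz == 4.0f becomes 4.0 * 2048 == 0x2000, which the mask preserves.
 */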
3025 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3026 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3027 }
3028
3029 if (key->userclip_active) {
3030 current_annotation = "Clipping flags";
3031 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3032 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3033
3034 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3035 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3036 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3037
3038 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3039 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3040 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3041 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3042 }
3043
3044 /* i965 clipping workaround:
3045 * 1) Test for -ve rhw
3046 * 2) If set,
3047 * set ndc = (0,0,0,0)
3048 * set ucp[6] = 1
3049 *
3050 * Later, clipping will detect ucp[6] and ensure the primitive is
3051 * clipped against all fixed planes.
3052 */
3053 if (devinfo->has_negative_rhw_bug) {
3054 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3055 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3056 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3057 vec4_instruction *inst;
3058 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3059 inst->predicate = BRW_PREDICATE_NORMAL;
3060 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3061 inst->predicate = BRW_PREDICATE_NORMAL;
3062 }
3063
3064 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3065 } else if (devinfo->gen < 6) {
3066 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3067 } else {
3068 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3069 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3070 dst_reg reg_w = reg;
3071 reg_w.writemask = WRITEMASK_W;
3072 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3073 }
3074 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3075 dst_reg reg_y = reg;
3076 reg_y.writemask = WRITEMASK_Y;
3077 reg_y.type = BRW_REGISTER_TYPE_D;
3078 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3079 }
3080 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3081 dst_reg reg_z = reg;
3082 reg_z.writemask = WRITEMASK_Z;
3083 reg_z.type = BRW_REGISTER_TYPE_D;
3084 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3085 }
3086 }
3087 }
3088
3089 void
3090 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3091 {
3092 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3093 *
3094 * "If a linked set of shaders forming the vertex stage contains no
3095 * static write to gl_ClipVertex or gl_ClipDistance, but the
3096 * application has requested clipping against user clip planes through
3097 * the API, then the coordinate written to gl_Position is used for
3098 * comparison against the user clip planes."
3099 *
3100 * This function is only called if the shader didn't write to
3101 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3102 * if the user wrote to it; otherwise we use gl_Position.
3103 */
3104 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3105 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3106 clip_vertex = VARYING_SLOT_POS;
3107 }
3108
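/* Each enabled clip distance is the 4-component dot product of the chosen
 * clip vertex with the corresponding user clip plane; 'offset' selects
 * planes 0-3 (CLIP_DIST0) or 4-7 (CLIP_DIST1).
 */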
3109 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3110 ++i) {
3111 reg.writemask = 1 << i;
3112 emit(DP4(reg,
3113 src_reg(output_reg[clip_vertex]),
3114 src_reg(this->userplane[i + offset])));
3115 }
3116 }
3117
3118 vec4_instruction *
3119 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3120 {
3121 assert(varying < VARYING_SLOT_MAX);
3122 reg.type = output_reg[varying].type;
3123 current_annotation = output_reg_annotation[varying];
3124 /* Copy the register, saturating if necessary */
3125 return emit(MOV(reg, src_reg(output_reg[varying])));
3126 }
3127
3128 void
3129 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3130 {
3131 reg.type = BRW_REGISTER_TYPE_F;
3132
3133 switch (varying) {
3134 case VARYING_SLOT_PSIZ:
3135 {
3136 /* PSIZ is always in slot 0, and is coupled with other flags. */
3137 current_annotation = "indices, point width, clip flags";
3138 emit_psiz_and_flags(reg);
3139 break;
3140 }
3141 case BRW_VARYING_SLOT_NDC:
3142 current_annotation = "NDC";
3143 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3144 break;
3145 case VARYING_SLOT_POS:
3146 current_annotation = "gl_Position";
3147 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3148 break;
3149 case VARYING_SLOT_EDGE:
3150 /* This is present when doing unfilled polygons. We're supposed to copy
3151 * the edge flag from the user-provided vertex array
3152 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3153 * of that attribute (starts as 1.0f). This is then used in clipping to
3154 * determine which edges should be drawn as wireframe.
3155 */
3156 current_annotation = "edge flag";
3157 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3158 glsl_type::float_type, WRITEMASK_XYZW))));
3159 break;
3160 case BRW_VARYING_SLOT_PAD:
3161 /* No need to write to this slot */
3162 break;
3163 case VARYING_SLOT_COL0:
3164 case VARYING_SLOT_COL1:
3165 case VARYING_SLOT_BFC0:
3166 case VARYING_SLOT_BFC1: {
3167 /* These built-in varyings are only supported in compatibility mode,
3168 * and we only support GS in core profile. So, this must be a vertex
3169 * shader.
3170 */
3171 assert(stage == MESA_SHADER_VERTEX);
3172 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3173 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3174 inst->saturate = true;
3175 break;
3176 }
3177
3178 default:
3179 emit_generic_urb_slot(reg, varying);
3180 break;
3181 }
3182 }
3183
3184 static int
3185 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3186 {
3187 if (devinfo->gen >= 6) {
3188 /* URB data written (does not include the message header reg) must
3189 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3190 * section 5.4.3.2.2: URB_INTERLEAVED.
3191 *
3192 * URB entries are allocated on a multiple of 1024 bits, so an
3193 * extra 128 bits written here to make the end align to 256 is
3194 * no problem.
3195 */
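/* mlen here includes the header register, so the data length is mlen - 1.
 * For example, mlen == 4 (header plus 3 data registers) is padded to 5 so
 * that an even 4 data registers are written.
 */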
3196 if ((mlen % 2) != 1)
3197 mlen++;
3198 }
3199
3200 return mlen;
3201 }
3202
3203
3204 /**
3205 * Generates the VUE payload plus the necessary URB write instructions to
3206 * output it.
3207 *
3208 * The VUE layout is documented in Volume 2a.
3209 */
3210 void
3211 vec4_visitor::emit_vertex()
3212 {
3213 /* MRF 0 is reserved for the debugger, so start with message header
3214 * in MRF 1.
3215 */
3216 int base_mrf = 1;
3217 int mrf = base_mrf;
3218 /* In the process of generating our URB write message contents, we
3219 * may need to unspill a register or load from an array. Those
3220 * reads would use MRFs 14-15.
3221 */
3222 int max_usable_mrf = 13;
3223
3224 /* The following assertion verifies that max_usable_mrf causes an
3225 * even-numbered amount of URB write data, which will meet gen6's
3226 * requirements for length alignment.
3227 */
3228 assert((max_usable_mrf - base_mrf) % 2 == 0);
3229
3230 /* First mrf is the g0-based message header containing URB handles and
3231 * such.
3232 */
3233 emit_urb_write_header(mrf++);
3234
3235 if (devinfo->gen < 6) {
3236 emit_ndc_computation();
3237 }
3238
3239 /* Lower legacy ff and ClipVertex clipping to clip distances */
3240 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3241 current_annotation = "user clip distances";
3242
3243 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3244 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3245
3246 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3247 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3248 }
3249
3250 /* We may need to split this up into several URB writes, so do them in a
3251 * loop.
3252 */
3253 int slot = 0;
3254 bool complete = false;
3255 do {
3256 /* URB offset is in URB row increments, and each of our MRFs is half of
3257 * one of those, since we're doing interleaved writes.
3258 */
3259 int offset = slot / 2;
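/* For example, if the first write covered slots 0..11, the next iteration
 * resumes at slot 12, i.e. URB row offset 6.
 */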
3260
3261 mrf = base_mrf + 1;
3262 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3263 emit_urb_slot(dst_reg(MRF, mrf++),
3264 prog_data->vue_map.slot_to_varying[slot]);
3265
3266 /* If this was max_usable_mrf, we can't fit anything more into this
3267 * URB WRITE.
3268 */
3269 if (mrf > max_usable_mrf) {
3270 slot++;
3271 break;
3272 }
3273 }
3274
3275 complete = slot >= prog_data->vue_map.num_slots;
3276 current_annotation = "URB write";
3277 vec4_instruction *inst = emit_urb_write_opcode(complete);
3278 inst->base_mrf = base_mrf;
3279 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3280 inst->offset += offset;
3281 } while(!complete);
3282 }
3283
3284
3285 src_reg
3286 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3287 src_reg *reladdr, int reg_offset)
3288 {
3289 /* Because we store the values to scratch interleaved like our
3290 * vertex data, we need to scale the vec4 index by 2.
3291 */
3292 int message_header_scale = 2;
3293
3294 /* Pre-gen6, the message header uses byte offsets instead of vec4
3295 * (16-byte) offset units.
3296 */
3297 if (devinfo->gen < 6)
3298 message_header_scale *= 16;
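/* For example, reg_offset == 3 becomes 6 (interleaved vec4 slots) on
 * Gen6+, or 3 * 32 == 96 bytes on older generations.
 */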
3299
3300 if (reladdr) {
3301 src_reg index = src_reg(this, glsl_type::int_type);
3302
3303 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3304 src_reg(reg_offset)));
3305 emit_before(block, inst, MUL(dst_reg(index), index,
3306 src_reg(message_header_scale)));
3307
3308 return index;
3309 } else {
3310 return src_reg(reg_offset * message_header_scale);
3311 }
3312 }
3313
3314 src_reg
3315 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3316 src_reg *reladdr, int reg_offset)
3317 {
3318 if (reladdr) {
3319 src_reg index = src_reg(this, glsl_type::int_type);
3320
3321 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3322 src_reg(reg_offset)));
3323
3324 /* Pre-gen6, the message header uses byte offsets instead of vec4
3325 * (16-byte) offset units.
3326 */
3327 if (devinfo->gen < 6) {
3328 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3329 }
3330
3331 return index;
3332 } else if (devinfo->gen >= 8) {
3333 /* Store the offset in a GRF so we can send-from-GRF. */
3334 src_reg offset = src_reg(this, glsl_type::int_type);
3335 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3336 return offset;
3337 } else {
3338 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3339 return src_reg(reg_offset * message_header_scale);
3340 }
3341 }
3342
3343 /**
3344 * Emits an instruction before @inst to load the value named by @orig_src
3345 * from scratch space at @base_offset to @temp.
3346 *
3347 * @base_offset is measured in 32-byte units (the size of a register).
3348 */
3349 void
3350 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3351 dst_reg temp, src_reg orig_src,
3352 int base_offset)
3353 {
3354 int reg_offset = base_offset + orig_src.reg_offset;
3355 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3356 reg_offset);
3357
3358 emit_before(block, inst, SCRATCH_READ(temp, index));
3359 }
3360
3361 /**
3362 * Emits an instruction after @inst to store the value to be written
3363 * to @orig_dst to scratch space at @base_offset, from @temp.
3364 *
3365 * @base_offset is measured in 32-byte units (the size of a register).
3366 */
3367 void
3368 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3369 int base_offset)
3370 {
3371 int reg_offset = base_offset + inst->dst.reg_offset;
3372 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3373 reg_offset);
3374
3375 /* Create a temporary register to store *inst's result in.
3376 *
3377 * We have to be careful in MOVing from our temporary result register in
3378 * the scratch write. If we swizzle from channels of the temporary that
3379 * weren't initialized, it will confuse live interval analysis, which will
3380 * make spilling fail to make progress.
3381 */
3382 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3383 inst->dst.type),
3384 brw_swizzle_for_mask(inst->dst.writemask));
3385 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3386 inst->dst.writemask));
3387 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3388 write->predicate = inst->predicate;
3389 write->ir = inst->ir;
3390 write->annotation = inst->annotation;
3391 inst->insert_after(block, write);
3392
3393 inst->dst.file = temp.file;
3394 inst->dst.reg = temp.reg;
3395 inst->dst.reg_offset = temp.reg_offset;
3396 inst->dst.reladdr = NULL;
3397 }
3398
3399 /**
3400 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3401 * adds the scratch read(s) before \p inst. The function also checks for
3402 * recursive reladdr scratch accesses, issuing the corresponding scratch
3403 * loads and rewriting reladdr references accordingly.
3404 *
3405 * \return \p src if it did not require a scratch load, otherwise, the
3406 * register holding the result of the scratch load that the caller should
3407 * use to rewrite src.
3408 */
3409 src_reg
3410 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3411 vec4_instruction *inst, src_reg src)
3412 {
3413 /* Resolve recursive reladdr scratch access by calling ourselves
3414 * with src.reladdr
3415 */
3416 if (src.reladdr)
3417 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3418 *src.reladdr);
3419
3420 /* Now handle scratch access on src */
3421 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3422 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3423 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3424 src.reg = temp.reg;
3425 src.reg_offset = temp.reg_offset;
3426 src.reladdr = NULL;
3427 }
3428
3429 return src;
3430 }
3431
3432 /**
3433 * We can't generally support array access in GRF space, because a
3434 * single instruction's destination can only span 2 contiguous
3435 * registers. So, we send all GRF arrays that get variable index
3436 * access to scratch space.
3437 */
3438 void
3439 vec4_visitor::move_grf_array_access_to_scratch()
3440 {
3441 int scratch_loc[this->alloc.count];
3442 memset(scratch_loc, -1, sizeof(scratch_loc));
3443
3444 /* First, calculate the set of virtual GRFs that need to be punted
3445 * to scratch due to having any array access on them, and where in
3446 * scratch.
3447 */
3448 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3449 if (inst->dst.file == GRF && inst->dst.reladdr) {
3450 if (scratch_loc[inst->dst.reg] == -1) {
3451 scratch_loc[inst->dst.reg] = c->last_scratch;
3452 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3453 }
3454
3455 for (src_reg *iter = inst->dst.reladdr;
3456 iter->reladdr;
3457 iter = iter->reladdr) {
3458 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3459 scratch_loc[iter->reg] = c->last_scratch;
3460 c->last_scratch += this->alloc.sizes[iter->reg];
3461 }
3462 }
3463 }
3464
3465 for (int i = 0 ; i < 3; i++) {
3466 for (src_reg *iter = &inst->src[i];
3467 iter->reladdr;
3468 iter = iter->reladdr) {
3469 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3470 scratch_loc[iter->reg] = c->last_scratch;
3471 c->last_scratch += this->alloc.sizes[iter->reg];
3472 }
3473 }
3474 }
3475 }
3476
3477 /* Now, for anything that will be accessed through scratch, rewrite
3478 * it to load/store. Note that this is a _safe list walk, because
3479 * we may generate a new scratch_write instruction after the one
3480 * we're processing.
3481 */
3482 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3483 /* Set up the annotation tracking for new generated instructions. */
3484 base_ir = inst->ir;
3485 current_annotation = inst->annotation;
3486
3487 /* First handle scratch access on the dst. Notice we have to handle
3488 * the case where the dst's reladdr also points to scratch space.
3489 */
3490 if (inst->dst.reladdr)
3491 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3492 *inst->dst.reladdr);
3493
3494 /* Now that we have handled any (possibly recursive) reladdr scratch
3495 * accesses for dst we can safely do the scratch write for dst itself
3496 */
3497 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3498 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3499
3500 /* Now handle scratch access on any src. In this case, since inst->src[i]
3501 * already is a src_reg, we can just call emit_resolve_reladdr with
3502 * inst->src[i] and it will take care of handling scratch loads for
3503 * both src and src.reladdr (recursively).
3504 */
3505 for (int i = 0 ; i < 3; i++) {
3506 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3507 inst->src[i]);
3508 }
3509 }
3510 }
3511
3512 /**
3513 * Emits an instruction before @inst to load the value named by @orig_src
3514 * from the pull constant buffer (surface) at @base_offset to @temp.
3515 */
3516 void
3517 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3518 dst_reg temp, src_reg orig_src,
3519 int base_offset)
3520 {
3521 int reg_offset = base_offset + orig_src.reg_offset;
3522 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3523 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3524 reg_offset);
3525
3526 emit_pull_constant_load_reg(temp,
3527 index,
3528 offset,
3529 block, inst);
3530 }
3531
3532 /**
3533 * Implements array access of uniforms by inserting a
3534 * PULL_CONSTANT_LOAD instruction.
3535 *
3536 * Unlike temporary GRF array access (where we don't support it due to
3537 * the difficulty of doing relative addressing on instruction
3538 * destinations), we could potentially do array access of uniforms
3539 * that were loaded in GRF space as push constants. In real-world
3540 * usage we've seen, though, the arrays being used are always larger
3541 * than we could load as push constants, so just always move all
3542 * uniform array access out to a pull constant buffer.
3543 */
3544 void
3545 vec4_visitor::move_uniform_array_access_to_pull_constants()
3546 {
3547 int pull_constant_loc[this->uniforms];
3548 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3549 bool nested_reladdr;
3550
3551 /* Walk through and find array access of uniforms. Put a copy of that
3552 * uniform in the pull constant buffer.
3553 *
3554 * Note that we don't move constant-indexed accesses to arrays. No
3555 * testing has been done of the performance impact of this choice.
3556 */
3557 do {
3558 nested_reladdr = false;
3559
3560 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3561 for (int i = 0 ; i < 3; i++) {
3562 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3563 continue;
3564
3565 int uniform = inst->src[i].reg;
3566
3567 if (inst->src[i].reladdr->reladdr)
3568 nested_reladdr = true; /* will need another pass */
3569
3570 /* If this array isn't already present in the pull constant buffer,
3571 * add it.
3572 */
3573 if (pull_constant_loc[uniform] == -1) {
3574 const gl_constant_value **values =
3575 &stage_prog_data->param[uniform * 4];
3576
3577 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3578
3579 assert(uniform < uniform_array_size);
3580 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3581 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3582 = values[j];
3583 }
3584 }
3585
3586 /* Set up the annotation tracking for new generated instructions. */
3587 base_ir = inst->ir;
3588 current_annotation = inst->annotation;
3589
3590 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3591
3592 emit_pull_constant_load(block, inst, temp, inst->src[i],
3593 pull_constant_loc[uniform]);
3594
3595 inst->src[i].file = temp.file;
3596 inst->src[i].reg = temp.reg;
3597 inst->src[i].reg_offset = temp.reg_offset;
3598 inst->src[i].reladdr = NULL;
3599 }
3600 }
3601 } while (nested_reladdr);
3602
3603 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3604 * no need to track them as larger-than-vec4 objects. This will be
3605 * relied on in cutting out unused uniform vectors from push
3606 * constants.
3607 */
3608 split_uniform_registers();
3609 }
3610
3611 void
3612 vec4_visitor::resolve_ud_negate(src_reg *reg)
3613 {
3614 if (reg->type != BRW_REGISTER_TYPE_UD ||
3615 !reg->negate)
3616 return;
3617
3618 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3619 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3620 *reg = temp;
3621 }
3622
3623 /**
3624 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3625 *
3626 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3627 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3628 */
3629 void
3630 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3631 {
3632 assert(devinfo->gen <= 5);
3633
3634 if (!rvalue->type->is_boolean())
3635 return;
3636
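/* ANDing with 1 keeps only the defined LSB; negating that 0/1 value then
 * yields the canonical 0 or ~0 (e.g. -(1) == 0xffffffff).
 */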
3637 src_reg and_result = src_reg(this, rvalue->type);
3638 src_reg neg_result = src_reg(this, rvalue->type);
3639 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3640 emit(MOV(dst_reg(neg_result), negate(and_result)));
3641 *reg = neg_result;
3642 }
3643
3644 vec4_visitor::vec4_visitor(struct brw_context *brw,
3645 struct brw_vec4_compile *c,
3646 struct gl_program *prog,
3647 const struct brw_vue_prog_key *key,
3648 struct brw_vue_prog_data *prog_data,
3649 struct gl_shader_program *shader_prog,
3650 gl_shader_stage stage,
3651 void *mem_ctx,
3652 bool no_spills,
3653 shader_time_shader_type st_base,
3654 shader_time_shader_type st_written,
3655 shader_time_shader_type st_reset)
3656 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3657 c(c),
3658 key(key),
3659 prog_data(prog_data),
3660 sanity_param_count(0),
3661 fail_msg(NULL),
3662 first_non_payload_grf(0),
3663 need_all_constants_in_pull_buffer(false),
3664 no_spills(no_spills),
3665 st_base(st_base),
3666 st_written(st_written),
3667 st_reset(st_reset)
3668 {
3669 this->mem_ctx = mem_ctx;
3670 this->failed = false;
3671
3672 this->base_ir = NULL;
3673 this->current_annotation = NULL;
3674 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3675
3676 this->variable_ht = hash_table_ctor(0,
3677 hash_table_pointer_hash,
3678 hash_table_pointer_compare);
3679
3680 this->virtual_grf_start = NULL;
3681 this->virtual_grf_end = NULL;
3682 this->live_intervals = NULL;
3683
3684 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3685
3686 this->uniforms = 0;
3687
3688 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3689 * at least one. See setup_uniforms() in brw_vec4.cpp.
3690 */
3691 this->uniform_array_size = 1;
3692 if (prog_data) {
3693 this->uniform_array_size =
3694 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3695 }
3696
3697 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3698 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3699 }
3700
3701 vec4_visitor::~vec4_visitor()
3702 {
3703 hash_table_dtor(this->variable_ht);
3704 }
3705
3706
3707 void
3708 vec4_visitor::fail(const char *format, ...)
3709 {
3710 va_list va;
3711 char *msg;
3712
3713 if (failed)
3714 return;
3715
3716 failed = true;
3717
3718 va_start(va, format);
3719 msg = ralloc_vasprintf(mem_ctx, format, va);
3720 va_end(va);
3721 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3722
3723 this->fail_msg = msg;
3724
3725 if (debug_enabled) {
3726 fprintf(stderr, "%s", msg);
3727 }
3728 }
3729
3730 } /* namespace brw */