i965: Enable EXT_shader_samples_identical
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
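/* A minimal usage sketch: pre-gen6 MIN/MAX is built from a CMP followed by
 * a predicated SEL, e.g.
 *
 *    emit(CMP(dst, src0, src1, BRW_CONDITIONAL_L));
 *    inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * which is exactly what emit_minmax() below does on older hardware.
 */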
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 *  w z    y          x       w z    y          x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
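/* Worked example: packHalf2x16(vec2(1.0, -2.0)). f32to16 left
 * tmp = (0x00003c00, 0x0000c000, ., .), the SHL above wrote 0xc0000000 to
 * dst, and the OR below combines that with tmp.x to give 0xc0003c00.
 */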
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
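/* Worked example: unpackHalf2x16(0xc0003c00u). The AND/SHR below produce
 * tmp = (0x3c00, 0xc000, ., .), and f16to32 expands that to (1.0, -2.0).
 */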
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
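/* A note on the immediate: each byte of a VF (packed vector-float)
 * immediate is a restricted 8-bit float with 1 sign bit, 3 exponent bits
 * (bias 3) and 4 mantissa bits, so 0x00, 0x60, 0x70 and 0x78 decode to
 * 0.0, 8.0, 16.0 and 24.0 -- the shift vector <0, 8, 16, 24>.
 */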
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
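/* Sketch of the recipe, matching GLSL's packUnorm4x8: clamp each channel
 * to [0, 1], scale by 255, round to nearest even, convert to integer and
 * pack the low byte of each channel with .x in the least significant byte.
 * E.g. vec4(0.0, 0.2, 0.5, 1.0) packs to 0xff803300.
 */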
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573 * number of columns; for arrays and structs, the sum of the vec4_size of
574 * each of their elements; and for samplers and atomics, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
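/* For example: a mat3 needs 3 vec4 slots (one per column), float[10] needs
 * 10 (each element is padded out to a full vec4), and
 * struct { vec3 a; float b; } needs 2 (one vec4 per member).
 */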
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 unreachable("not reached");
626 }
627
628 return 0;
629 }
630
631 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
632 {
633 init();
634
635 this->file = VGRF;
636 this->nr = v->alloc.allocate(type_size_vec4(type));
637
638 if (type->is_array() || type->is_record()) {
639 this->swizzle = BRW_SWIZZLE_NOOP;
640 } else {
641 this->swizzle = brw_swizzle_for_size(type->vector_elements);
642 }
643
644 this->type = brw_type_for_base_type(type);
645 }
646
647 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
648 {
649 assert(size > 0);
650
651 init();
652
653 this->file = VGRF;
654 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
655
656 this->swizzle = BRW_SWIZZLE_NOOP;
657
658 this->type = brw_type_for_base_type(type);
659 }
660
661 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
662 {
663 init();
664
665 this->file = VGRF;
666 this->nr = v->alloc.allocate(type_size_vec4(type));
667
668 if (type->is_array() || type->is_record()) {
669 this->writemask = WRITEMASK_XYZW;
670 } else {
671 this->writemask = (1 << type->vector_elements) - 1;
672 }
673
674 this->type = brw_type_for_base_type(type);
675 }
676
677 vec4_instruction *
678 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
679 src_reg src0, src_reg src1)
680 {
681 vec4_instruction *inst;
682
683 if (devinfo->gen >= 6) {
684 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
685 inst->conditional_mod = conditionalmod;
686 } else {
687 emit(CMP(dst, src0, src1, conditionalmod));
688
689 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
690 inst->predicate = BRW_PREDICATE_NORMAL;
691 }
692
693 return inst;
694 }
695
696 vec4_instruction *
697 vec4_visitor::emit_lrp(const dst_reg &dst,
698 const src_reg &x, const src_reg &y, const src_reg &a)
699 {
700 if (devinfo->gen >= 6) {
701 /* Note that the instruction's argument order is reversed from GLSL
702 * and the IR.
703 */
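/* Concretely, the LRP emitted below evaluates a * y + (1 - a) * x, i.e.
 * GLSL's mix(x, y, a), matching the pre-gen6 expansion in the else branch.
 */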
704 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
705 fix_3src_operand(x)));
706 } else {
707 /* Earlier generations don't support three source operations, so we
708 * need to emit x*(1-a) + y*a.
709 */
710 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
711 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
712 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
713 y_times_a.writemask = dst.writemask;
714 one_minus_a.writemask = dst.writemask;
715 x_times_one_minus_a.writemask = dst.writemask;
716
717 emit(MUL(y_times_a, y, a));
718 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
719 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
720 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
721 }
722 }
723
724 /**
725 * Emits the instructions needed to perform a pull constant load. before_block
726 * and before_inst can be NULL, in which case the instructions will be appended
727 * to the end of the instruction list.
728 */
729 void
730 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
731 src_reg surf_index,
732 src_reg offset_reg,
733 bblock_t *before_block,
734 vec4_instruction *before_inst)
735 {
736 assert((before_inst == NULL && before_block == NULL) ||
737 (before_inst && before_block));
738
739 vec4_instruction *pull;
740
741 if (devinfo->gen >= 9) {
742 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
743 src_reg header(this, glsl_type::uvec4_type, 2);
744
745 pull = new(mem_ctx)
746 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
747 dst_reg(header));
748
749 if (before_inst)
750 emit_before(before_block, before_inst, pull);
751 else
752 emit(pull);
753
754 dst_reg index_reg = retype(offset(dst_reg(header), 1),
755 offset_reg.type);
756 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
757
758 if (before_inst)
759 emit_before(before_block, before_inst, pull);
760 else
761 emit(pull);
762
763 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
764 dst,
765 surf_index,
766 header);
767 pull->mlen = 2;
768 pull->header_size = 1;
769 } else if (devinfo->gen >= 7) {
770 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
771
772 grf_offset.type = offset_reg.type;
773
774 pull = MOV(grf_offset, offset_reg);
775
776 if (before_inst)
777 emit_before(before_block, before_inst, pull);
778 else
779 emit(pull);
780
781 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
782 dst,
783 surf_index,
784 src_reg(grf_offset));
785 pull->mlen = 1;
786 } else {
787 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
788 dst,
789 surf_index,
790 offset_reg);
791 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
792 pull->mlen = 1;
793 }
794
795 if (before_inst)
796 emit_before(before_block, before_inst, pull);
797 else
798 emit(pull);
799 }
800
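/**
 * Emit FIND_LIVE_CHANNEL to locate the first enabled channel and BROADCAST
 * to replicate that channel's value of \p src across the result, yielding
 * a dynamically uniform value.
 */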
801 src_reg
802 vec4_visitor::emit_uniformize(const src_reg &src)
803 {
804 const src_reg chan_index(this, glsl_type::uint_type);
805 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
806 src.type);
807
808 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
809 ->force_writemask_all = true;
810 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
811 ->force_writemask_all = true;
812
813 return src_reg(dst);
814 }
815
816 src_reg
817 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
818 src_reg coordinate, src_reg sampler)
819 {
820 vec4_instruction *inst =
821 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
822 dst_reg(this, glsl_type::uvec4_type));
823 inst->base_mrf = 2;
824 inst->src[1] = sampler;
825
826 int param_base;
827
828 if (devinfo->gen >= 9) {
829 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
830 vec4_instruction *header_inst = new(mem_ctx)
831 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
832 dst_reg(MRF, inst->base_mrf));
833
834 emit(header_inst);
835
836 inst->mlen = 2;
837 inst->header_size = 1;
838 param_base = inst->base_mrf + 1;
839 } else {
840 inst->mlen = 1;
841 param_base = inst->base_mrf;
842 }
843
844 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
845 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
846 int zero_mask = 0xf & ~coord_mask;
847
848 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
849 coordinate));
850
851 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
852 brw_imm_d(0)));
853
854 emit(inst);
855 return src_reg(inst->dst);
856 }
857
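/**
 * True when the sampler index can't be encoded in the 4-bit sampler field
 * of the message descriptor (indices 16 and up, or any dynamically indexed
 * sampler) and so has to be supplied through the message header instead.
 * Only Haswell and later can do this, so earlier parts always get false.
 */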
858 bool
859 vec4_visitor::is_high_sampler(src_reg sampler)
860 {
861 if (devinfo->gen < 8 && !devinfo->is_haswell)
862 return false;
863
864 return sampler.file != IMM || sampler.ud >= 16;
865 }
866
867 void
868 vec4_visitor::emit_texture(ir_texture_opcode op,
869 dst_reg dest,
870 const glsl_type *dest_type,
871 src_reg coordinate,
872 int coord_components,
873 src_reg shadow_comparitor,
874 src_reg lod, src_reg lod2,
875 src_reg sample_index,
876 uint32_t constant_offset,
877 src_reg offset_value,
878 src_reg mcs,
879 bool is_cube_array,
880 uint32_t sampler,
881 src_reg sampler_reg)
882 {
883 /* The sampler can only meaningfully compute LOD for fragment shader
884 * messages. For all other stages, we change the opcode to TXL and hardcode
885 * the LOD to 0.
886 *
887 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
888 * valid LOD argument.
889 */
890 if (op == ir_tex || op == ir_query_levels) {
891 assert(lod.file == BAD_FILE);
892 lod = brw_imm_f(0.0f);
893 }
894
895 enum opcode opcode;
896 switch (op) {
897 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
898 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
899 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
900 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
901 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
902 SHADER_OPCODE_TXF_CMS); break;
903 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
904 case ir_tg4: opcode = offset_value.file != BAD_FILE
905 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
906 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
907 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
908 case ir_txb:
909 unreachable("TXB is not valid for vertex shaders.");
910 case ir_lod:
911 unreachable("LOD is not valid for vertex shaders.");
912 case ir_samples_identical: {
913 /* There are some challenges implementing this for vec4, and it seems
914 * unlikely to be used anyway. For now, just always return false.
915 */
916 emit(MOV(dest, brw_imm_ud(0u)));
917 return;
918 }
919 default:
920 unreachable("Unrecognized tex op");
921 }
922
923 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
924 opcode, dst_reg(this, dest_type));
925
926 inst->offset = constant_offset;
927
928 /* The message header is necessary for:
929 * - Gen4 (always)
930 * - Gen9+ for selecting SIMD4x2
931 * - Texel offsets
932 * - Gather channel selection
933 * - Sampler indices too large to fit in a 4-bit value.
934 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
935 */
936 inst->header_size =
937 (devinfo->gen < 5 || devinfo->gen >= 9 ||
938 inst->offset != 0 || op == ir_tg4 ||
939 op == ir_texture_samples ||
940 is_high_sampler(sampler_reg)) ? 1 : 0;
941 inst->base_mrf = 2;
942 inst->mlen = inst->header_size;
943 inst->dst.writemask = WRITEMASK_XYZW;
944 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
945
946 inst->src[1] = sampler_reg;
947
948 /* MRF for the first parameter */
949 int param_base = inst->base_mrf + inst->header_size;
950
951 if (op == ir_txs || op == ir_query_levels) {
952 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
953 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
954 inst->mlen++;
955 } else if (op == ir_texture_samples) {
956 inst->dst.writemask = WRITEMASK_X;
957 } else {
958 /* Load the coordinate */
959 /* FINISHME: gl_clamp_mask and saturate */
960 int coord_mask = (1 << coord_components) - 1;
961 int zero_mask = 0xf & ~coord_mask;
962
963 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
964 coordinate));
965 inst->mlen++;
966
967 if (zero_mask != 0) {
968 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
969 brw_imm_d(0)));
970 }
971 /* Load the shadow comparitor */
972 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
973 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
974 WRITEMASK_X),
975 shadow_comparitor));
976 inst->mlen++;
977 }
978
979 /* Load the LOD info */
980 if (op == ir_tex || op == ir_txl) {
981 int mrf, writemask;
982 if (devinfo->gen >= 5) {
983 mrf = param_base + 1;
984 if (shadow_comparitor.file != BAD_FILE) {
985 writemask = WRITEMASK_Y;
986 /* mlen already incremented */
987 } else {
988 writemask = WRITEMASK_X;
989 inst->mlen++;
990 }
991 } else /* devinfo->gen == 4 */ {
992 mrf = param_base;
993 writemask = WRITEMASK_W;
994 }
995 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
996 } else if (op == ir_txf) {
997 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
998 } else if (op == ir_txf_ms) {
999 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1000 sample_index));
1001 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1002 /* MCS data is stored in the first two channels of 'mcs', but we
1003 * need to get it into the .y and .z channels of the second vec4
1004 * of params.
1005 */
1006 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1007 emit(MOV(dst_reg(MRF, param_base + 1,
1008 glsl_type::uint_type, WRITEMASK_YZ),
1009 mcs));
1010 } else if (devinfo->gen >= 7) {
1011 /* MCS data is in the first channel of `mcs`, but we need to get it into
1012 * the .y channel of the second vec4 of params, so replicate .x across
1013 * the whole vec4 and then mask off everything except .y
1014 */
1015 mcs.swizzle = BRW_SWIZZLE_XXXX;
1016 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1017 mcs));
1018 }
1019 inst->mlen++;
1020 } else if (op == ir_txd) {
1021 const brw_reg_type type = lod.type;
1022
1023 if (devinfo->gen >= 5) {
1024 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1025 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1026 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1027 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1028 inst->mlen++;
1029
1030 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1031 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1032 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1033 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1034 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1035 inst->mlen++;
1036
1037 if (shadow_comparitor.file != BAD_FILE) {
1038 emit(MOV(dst_reg(MRF, param_base + 2,
1039 shadow_comparitor.type, WRITEMASK_Z),
1040 shadow_comparitor));
1041 }
1042 }
1043 } else /* devinfo->gen == 4 */ {
1044 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1045 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1046 inst->mlen += 2;
1047 }
1048 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1049 if (shadow_comparitor.file != BAD_FILE) {
1050 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1051 shadow_comparitor));
1052 }
1053
1054 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1055 offset_value));
1056 inst->mlen++;
1057 }
1058 }
1059
1060 emit(inst);
1061
1062 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1063 * spec requires layers.
1064 */
1065 if (op == ir_txs && is_cube_array) {
1066 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1067 writemask(inst->dst, WRITEMASK_Z),
1068 src_reg(inst->dst), brw_imm_d(6));
1069 }
1070
1071 if (devinfo->gen == 6 && op == ir_tg4) {
1072 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1073 }
1074
1075 swizzle_result(op, dest,
1076 src_reg(inst->dst), sampler, dest_type);
1077 }
1078
1079 /**
1080 * Apply workarounds for Gen6 gather with UINT/SINT
1081 */
1082 void
1083 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1084 {
1085 if (!wa)
1086 return;
1087
1088 int width = (wa & WA_8BIT) ? 8 : 16;
1089 dst_reg dst_f = dst;
1090 dst_f.type = BRW_REGISTER_TYPE_F;
1091
1092 /* Convert from UNORM to UINT */
1093 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1094 emit(MOV(dst, src_reg(dst_f)));
1095
1096 if (wa & WA_SIGN) {
1097 /* Reinterpret the UINT value as a signed INT value by
1098 * shifting the sign bit into place, then shifting back
1099 * preserving sign.
1100 */
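/* E.g. for an 8-bit SINT format, a stored -1 comes back as UNORM 1.0; the
 * MUL above turned that into 255 (0x000000ff), and the SHL/ASR below by 24
 * reinterpret it as the signed value -1.
 */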
1101 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1102 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1103 }
1104 }
1105
1106 /**
1107 * Set up the gather channel based on the swizzle, for gather4.
1108 */
1109 uint32_t
1110 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1111 {
1112 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1113 switch (swiz) {
1114 case SWIZZLE_X: return 0;
1115 case SWIZZLE_Y:
1116 /* gather4 sampler is broken for green channel on RG32F --
1117 * we must ask for blue instead.
1118 */
1119 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1120 return 2;
1121 return 1;
1122 case SWIZZLE_Z: return 2;
1123 case SWIZZLE_W: return 3;
1124 default:
1125 unreachable("Not reached"); /* zero, one swizzles handled already */
1126 }
1127 }
1128
1129 void
1130 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1131 src_reg orig_val, uint32_t sampler,
1132 const glsl_type *dest_type)
1133 {
1134 int s = key_tex->swizzles[sampler];
1135
1136 dst_reg swizzled_result = dest;
1137
1138 if (op == ir_query_levels) {
1139 /* # levels is in .w */
1140 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1141 emit(MOV(swizzled_result, orig_val));
1142 return;
1143 }
1144
1145 if (op == ir_txs || dest_type == glsl_type::float_type
1146 || s == SWIZZLE_NOOP || op == ir_tg4) {
1147 emit(MOV(swizzled_result, orig_val));
1148 return;
1149 }
1150
1151
1152 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1153 int swizzle[4] = {0};
1154
1155 for (int i = 0; i < 4; i++) {
1156 switch (GET_SWZ(s, i)) {
1157 case SWIZZLE_ZERO:
1158 zero_mask |= (1 << i);
1159 break;
1160 case SWIZZLE_ONE:
1161 one_mask |= (1 << i);
1162 break;
1163 default:
1164 copy_mask |= (1 << i);
1165 swizzle[i] = GET_SWZ(s, i);
1166 break;
1167 }
1168 }
1169
1170 if (copy_mask) {
1171 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1172 swizzled_result.writemask = copy_mask;
1173 emit(MOV(swizzled_result, orig_val));
1174 }
1175
1176 if (zero_mask) {
1177 swizzled_result.writemask = zero_mask;
1178 emit(MOV(swizzled_result, brw_imm_f(0.0f)));
1179 }
1180
1181 if (one_mask) {
1182 swizzled_result.writemask = one_mask;
1183 emit(MOV(swizzled_result, brw_imm_f(1.0f)));
1184 }
1185 }
1186
1187 void
1188 vec4_visitor::gs_emit_vertex(int stream_id)
1189 {
1190 unreachable("not reached");
1191 }
1192
1193 void
1194 vec4_visitor::gs_end_primitive()
1195 {
1196 unreachable("not reached");
1197 }
1198
1199 void
1200 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1201 dst_reg dst, src_reg surf_offset,
1202 src_reg src0, src_reg src1)
1203 {
1204 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1205 src_reg src_payload(this, glsl_type::uint_type, mlen);
1206 dst_reg payload(src_payload);
1207 payload.writemask = WRITEMASK_X;
1208
1209 /* Set the atomic operation offset. */
1210 emit(MOV(offset(payload, 0), surf_offset));
1211 unsigned i = 1;
1212
1213 /* Set the atomic operation arguments. */
1214 if (src0.file != BAD_FILE) {
1215 emit(MOV(offset(payload, i), src0));
1216 i++;
1217 }
1218
1219 if (src1.file != BAD_FILE) {
1220 emit(MOV(offset(payload, i), src1));
1221 i++;
1222 }
1223
1224 /* Emit the instruction. Note that this maps to the normal SIMD8
1225 * untyped atomic message on Ivy Bridge, but that's OK because
1226 * unused channels will be masked out.
1227 */
1228 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1229 src_payload,
1230 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1231 inst->mlen = mlen;
1232 }
1233
1234 void
1235 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1236 src_reg surf_offset)
1237 {
1238 dst_reg offset(this, glsl_type::uint_type);
1239 offset.writemask = WRITEMASK_X;
1240
1241 /* Set the surface read offset. */
1242 emit(MOV(offset, surf_offset));
1243
1244 /* Emit the instruction. Note that this maps to the normal SIMD8
1245 * untyped surface read message, but that's OK because unused
1246 * channels will be masked out.
1247 */
1248 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1249 src_reg(offset),
1250 brw_imm_ud(surf_index), brw_imm_d(1));
1251 inst->mlen = 1;
1252 }
1253
1254 void
1255 vec4_visitor::emit_ndc_computation()
1256 {
1257 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1258 return;
1259
1260 /* Get the position */
1261 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1262
1263 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1264 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1265 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1266
1267 current_annotation = "NDC";
1268 dst_reg ndc_w = ndc;
1269 ndc_w.writemask = WRITEMASK_W;
1270 src_reg pos_w = pos;
1271 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1272 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1273
1274 dst_reg ndc_xyz = ndc;
1275 ndc_xyz.writemask = WRITEMASK_XYZ;
1276
1277 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1278 }
1279
1280 void
1281 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1282 {
1283 if (devinfo->gen < 6 &&
1284 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1285 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1286 devinfo->has_negative_rhw_bug)) {
1287 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1288 dst_reg header1_w = header1;
1289 header1_w.writemask = WRITEMASK_W;
1290
1291 emit(MOV(header1, brw_imm_ud(0u)));
1292
1293 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1294 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1295
1296 current_annotation = "Point size";
1297 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1298 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1299 }
1300
1301 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1302 current_annotation = "Clipping flags";
1303 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1304 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1305
1306 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1307 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1308 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1309
1310 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1311 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1312 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1313 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1314 }
1315
1316 /* i965 clipping workaround:
1317 * 1) Test for -ve rhw
1318 * 2) If set,
1319 * set ndc = (0,0,0,0)
1320 * set ucp[6] = 1
1321 *
1322 * Later, clipping will detect ucp[6] and ensure the primitive is
1323 * clipped against all fixed planes.
1324 */
1325 if (devinfo->has_negative_rhw_bug &&
1326 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1327 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1328 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1329 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1330 vec4_instruction *inst;
1331 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1332 inst->predicate = BRW_PREDICATE_NORMAL;
1333 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1334 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1335 inst->predicate = BRW_PREDICATE_NORMAL;
1336 }
1337
1338 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1339 } else if (devinfo->gen < 6) {
1340 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1341 } else {
1342 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1343 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1344 dst_reg reg_w = reg;
1345 reg_w.writemask = WRITEMASK_W;
1346 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1347 reg_as_src.type = reg_w.type;
1348 reg_as_src.swizzle = brw_swizzle_for_size(1);
1349 emit(MOV(reg_w, reg_as_src));
1350 }
1351 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1352 dst_reg reg_y = reg;
1353 reg_y.writemask = WRITEMASK_Y;
1354 reg_y.type = BRW_REGISTER_TYPE_D;
1355 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1356 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1357 }
1358 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1359 dst_reg reg_z = reg;
1360 reg_z.writemask = WRITEMASK_Z;
1361 reg_z.type = BRW_REGISTER_TYPE_D;
1362 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1363 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1364 }
1365 }
1366 }
1367
1368 vec4_instruction *
1369 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1370 {
1371 assert(varying < VARYING_SLOT_MAX);
1372 assert(output_reg[varying].type == reg.type);
1373 current_annotation = output_reg_annotation[varying];
1374 if (output_reg[varying].file != BAD_FILE)
1375 return emit(MOV(reg, src_reg(output_reg[varying])));
1376 else
1377 return NULL;
1378 }
1379
1380 void
1381 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1382 {
1383 reg.type = BRW_REGISTER_TYPE_F;
1384 output_reg[varying].type = reg.type;
1385
1386 switch (varying) {
1387 case VARYING_SLOT_PSIZ:
1388 {
1389 /* PSIZ is always in slot 0, and is coupled with other flags. */
1390 current_annotation = "indices, point width, clip flags";
1391 emit_psiz_and_flags(reg);
1392 break;
1393 }
1394 case BRW_VARYING_SLOT_NDC:
1395 current_annotation = "NDC";
1396 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1397 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1398 break;
1399 case VARYING_SLOT_POS:
1400 current_annotation = "gl_Position";
1401 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1402 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1403 break;
1404 case VARYING_SLOT_EDGE:
1405 /* This is present when doing unfilled polygons. We're supposed to copy
1406 * the edge flag from the user-provided vertex array
1407 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1408 * of that attribute (starts as 1.0f). This is then used in clipping to
1409 * determine which edges should be drawn as wireframe.
1410 */
1411 current_annotation = "edge flag";
1412 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1413 glsl_type::float_type, WRITEMASK_XYZW))));
1414 break;
1415 case BRW_VARYING_SLOT_PAD:
1416 /* No need to write to this slot */
1417 break;
1418 default:
1419 emit_generic_urb_slot(reg, varying);
1420 break;
1421 }
1422 }
1423
1424 static int
1425 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1426 {
1427 if (devinfo->gen >= 6) {
1428 /* URB data written (does not include the message header reg) must
1429 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1430 * section 5.4.3.2.2: URB_INTERLEAVED.
1431 *
1432 * URB entries are allocated on a multiple of 1024 bits, so an
1433 * extra 128 bits written here to make the end align to 256 is
1434 * no problem.
1435 */
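/* mlen counts the message header register too, so the data length is
 * mlen - 1 and mlen itself must end up odd. E.g. a header plus 5 slot
 * registers gives mlen = 6, which is bumped to 7 (6 data registers).
 */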
1436 if ((mlen % 2) != 1)
1437 mlen++;
1438 }
1439
1440 return mlen;
1441 }
1442
1443
1444 /**
1445 * Generates the VUE payload plus the necessary URB write instructions to
1446 * output it.
1447 *
1448 * The VUE layout is documented in Volume 2a.
1449 */
1450 void
1451 vec4_visitor::emit_vertex()
1452 {
1453 /* MRF 0 is reserved for the debugger, so start with message header
1454 * in MRF 1.
1455 */
1456 int base_mrf = 1;
1457 int mrf = base_mrf;
1458 /* In the process of generating our URB write message contents, we
1459 * may need to unspill a register or load from an array. Those
1460 * reads would use MRFs 14-15.
1461 */
1462 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1463
1464 /* The following assertion verifies that max_usable_mrf causes an
1465 * even number of URB write data registers, which meets gen6's
1466 * requirements for length alignment.
1467 */
1468 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1469
1470 /* First mrf is the g0-based message header containing URB handles and
1471 * such.
1472 */
1473 emit_urb_write_header(mrf++);
1474
1475 if (devinfo->gen < 6) {
1476 emit_ndc_computation();
1477 }
1478
1479 /* We may need to split this up into several URB writes, so do them in a
1480 * loop.
1481 */
1482 int slot = 0;
1483 bool complete = false;
1484 do {
1485 /* URB offset is in URB row increments, and each of our MRFs is half of
1486 * one of those, since we're doing interleaved writes.
1487 */
1488 int offset = slot / 2;
1489
1490 mrf = base_mrf + 1;
1491 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1492 emit_urb_slot(dst_reg(MRF, mrf++),
1493 prog_data->vue_map.slot_to_varying[slot]);
1494
1495 /* If this was max_usable_mrf, we can't fit anything more into this
1496 * URB WRITE. Same thing if we reached the maximum length available.
1497 */
1498 if (mrf > max_usable_mrf ||
1499 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1500 slot++;
1501 break;
1502 }
1503 }
1504
1505 complete = slot >= prog_data->vue_map.num_slots;
1506 current_annotation = "URB write";
1507 vec4_instruction *inst = emit_urb_write_opcode(complete);
1508 inst->base_mrf = base_mrf;
1509 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1510 inst->offset += offset;
1511 } while(!complete);
1512 }
1513
1514
1515 src_reg
1516 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1517 src_reg *reladdr, int reg_offset)
1518 {
1519 /* Because we store the values to scratch interleaved like our
1520 * vertex data, we need to scale the vec4 index by 2.
1521 */
1522 int message_header_scale = 2;
1523
1524 /* Pre-gen6, the message header uses byte offsets instead of vec4
1525 * (16-byte) offset units.
1526 */
1527 if (devinfo->gen < 6)
1528 message_header_scale *= 16;
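/* E.g. a constant reg_offset of 5 becomes a message offset of 10 on gen6+
 * and 160 (a byte offset) on gen4/5.
 */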
1529
1530 if (reladdr) {
1531 src_reg index = src_reg(this, glsl_type::int_type);
1532
1533 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1534 brw_imm_d(reg_offset)));
1535 emit_before(block, inst, MUL(dst_reg(index), index,
1536 brw_imm_d(message_header_scale)));
1537
1538 return index;
1539 } else {
1540 return brw_imm_d(reg_offset * message_header_scale);
1541 }
1542 }
1543
1544 src_reg
1545 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1546 src_reg *reladdr, int reg_offset)
1547 {
1548 if (reladdr) {
1549 src_reg index = src_reg(this, glsl_type::int_type);
1550
1551 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1552 brw_imm_d(reg_offset)));
1553
1554 /* Pre-gen6, the message header uses byte offsets instead of vec4
1555 * (16-byte) offset units.
1556 */
1557 if (devinfo->gen < 6) {
1558 emit_before(block, inst, MUL(dst_reg(index), index, brw_imm_d(16)));
1559 }
1560
1561 return index;
1562 } else if (devinfo->gen >= 8) {
1563 /* Store the offset in a GRF so we can send-from-GRF. */
1564 src_reg offset = src_reg(this, glsl_type::int_type);
1565 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset)));
1566 return offset;
1567 } else {
1568 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1569 return brw_imm_d(reg_offset * message_header_scale);
1570 }
1571 }
1572
1573 /**
1574 * Emits an instruction before @inst to load the value named by @orig_src
1575 * from scratch space at @base_offset to @temp.
1576 *
1577 * @base_offset is measured in 32-byte units (the size of a register).
1578 */
1579 void
1580 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1581 dst_reg temp, src_reg orig_src,
1582 int base_offset)
1583 {
1584 int reg_offset = base_offset + orig_src.reg_offset;
1585 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1586 reg_offset);
1587
1588 emit_before(block, inst, SCRATCH_READ(temp, index));
1589 }
1590
1591 /**
1592 * Emits an instruction after @inst to store the value to be written
1593 * to @orig_dst to scratch space at @base_offset, from @temp.
1594 *
1595 * @base_offset is measured in 32-byte units (the size of a register).
1596 */
1597 void
1598 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1599 int base_offset)
1600 {
1601 int reg_offset = base_offset + inst->dst.reg_offset;
1602 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1603 reg_offset);
1604
1605 /* Create a temporary register to store *inst's result in.
1606 *
1607 * We have to be careful in MOVing from our temporary result register in
1608 * the scratch write. If we swizzle from channels of the temporary that
1609 * weren't initialized, it will confuse live interval analysis, which will
1610 * make spilling fail to make progress.
1611 */
1612 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1613 inst->dst.type),
1614 brw_swizzle_for_mask(inst->dst.writemask));
1615 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1616 inst->dst.writemask));
1617 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1618 if (inst->opcode != BRW_OPCODE_SEL)
1619 write->predicate = inst->predicate;
1620 write->ir = inst->ir;
1621 write->annotation = inst->annotation;
1622 inst->insert_after(block, write);
1623
1624 inst->dst.file = temp.file;
1625 inst->dst.nr = temp.nr;
1626 inst->dst.reg_offset = temp.reg_offset;
1627 inst->dst.reladdr = NULL;
1628 }
1629
1630 /**
1631 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1632 * adds the scratch read(s) before \p inst. The function also checks for
1633 * recursive reladdr scratch accesses, issuing the corresponding scratch
1634 * loads and rewriting reladdr references accordingly.
1635 *
1636 * \return \p src if it did not require a scratch load, otherwise, the
1637 * register holding the result of the scratch load that the caller should
1638 * use to rewrite src.
1639 */
1640 src_reg
1641 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1642 vec4_instruction *inst, src_reg src)
1643 {
1644 /* Resolve recursive reladdr scratch access by calling ourselves
1645 * with src.reladdr
1646 */
1647 if (src.reladdr)
1648 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1649 *src.reladdr);
1650
1651 /* Now handle scratch access on src */
1652 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1653 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1654 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1655 src.nr = temp.nr;
1656 src.reg_offset = temp.reg_offset;
1657 src.reladdr = NULL;
1658 }
1659
1660 return src;
1661 }
1662
1663 /**
1664 * We can't generally support array access in GRF space, because a
1665 * single instruction's destination can only span 2 contiguous
1666 * registers. So, we send all GRF arrays that get variable index
1667 * access to scratch space.
1668 */
1669 void
1670 vec4_visitor::move_grf_array_access_to_scratch()
1671 {
1672 int scratch_loc[this->alloc.count];
1673 memset(scratch_loc, -1, sizeof(scratch_loc));
1674
1675 /* First, calculate the set of virtual GRFs that need to be punted
1676 * to scratch due to having any array access on them, and where in
1677 * scratch.
1678 */
1679 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1680 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1681 if (scratch_loc[inst->dst.nr] == -1) {
1682 scratch_loc[inst->dst.nr] = last_scratch;
1683 last_scratch += this->alloc.sizes[inst->dst.nr];
1684 }
1685
1686 for (src_reg *iter = inst->dst.reladdr;
1687 iter->reladdr;
1688 iter = iter->reladdr) {
1689 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1690 scratch_loc[iter->nr] = last_scratch;
1691 last_scratch += this->alloc.sizes[iter->nr];
1692 }
1693 }
1694 }
1695
1696 for (int i = 0 ; i < 3; i++) {
1697 for (src_reg *iter = &inst->src[i];
1698 iter->reladdr;
1699 iter = iter->reladdr) {
1700 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1701 scratch_loc[iter->nr] = last_scratch;
1702 last_scratch += this->alloc.sizes[iter->nr];
1703 }
1704 }
1705 }
1706 }
1707
1708 /* Now, for anything that will be accessed through scratch, rewrite
1709 * it to load/store. Note that this is a _safe list walk, because
1710 * we may generate a new scratch_write instruction after the one
1711 * we're processing.
1712 */
1713 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1714 /* Set up the annotation tracking for new generated instructions. */
1715 base_ir = inst->ir;
1716 current_annotation = inst->annotation;
1717
1718 /* First handle scratch access on the dst. Notice we have to handle
1719 * the case where the dst's reladdr also points to scratch space.
1720 */
1721 if (inst->dst.reladdr)
1722 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1723 *inst->dst.reladdr);
1724
1725 /* Now that we have handled any (possibly recursive) reladdr scratch
1726 * accesses for dst we can safely do the scratch write for dst itself
1727 */
1728 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1729 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1730
1731 /* Now handle scratch access on any src. In this case, since inst->src[i]
1732 * already is a src_reg, we can just call emit_resolve_reladdr with
1733 * inst->src[i] and it will take care of handling scratch loads for
1734 * both src and src.reladdr (recursively).
1735 */
1736 for (int i = 0 ; i < 3; i++) {
1737 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1738 inst->src[i]);
1739 }
1740 }
1741 }
1742
1743 /**
1744 * Emits an instruction before @inst to load the value named by @orig_src
1745 * from the pull constant buffer (surface) at @base_offset to @temp.
1746 */
1747 void
1748 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1749 dst_reg temp, src_reg orig_src,
1750 int base_offset)
1751 {
1752 int reg_offset = base_offset + orig_src.reg_offset;
1753 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1754 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1755 reg_offset);
1756
1757 emit_pull_constant_load_reg(temp,
1758 brw_imm_ud(index),
1759 offset,
1760 block, inst);
1761
1762 brw_mark_surface_used(&prog_data->base, index);
1763 }
1764
1765 /**
1766 * Implements array access of uniforms by inserting a
1767 * PULL_CONSTANT_LOAD instruction.
1768 *
1769 * Unlike temporary GRF array access (which we don't support due to
1770 * the difficulty of doing relative addressing on instruction
1771 * destinations), we could potentially do array access of uniforms
1772 * that were loaded in GRF space as push constants. In real-world
1773 * usage we've seen, though, the arrays being used are always larger
1774 * than we could load as push constants, so just always move all
1775 * uniform array access out to a pull constant buffer.
1776 */
1777 void
1778 vec4_visitor::move_uniform_array_access_to_pull_constants()
1779 {
1780 int pull_constant_loc[this->uniforms];
1781 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1782 bool nested_reladdr;
1783
1784 /* Walk through and find array access of uniforms. Put a copy of that
1785 * uniform in the pull constant buffer.
1786 *
1787 * Note that we don't move constant-indexed accesses to arrays. No
1788 * testing has been done of the performance impact of this choice.
1789 */
1790 do {
1791 nested_reladdr = false;
1792
1793 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1794 for (int i = 0 ; i < 3; i++) {
1795 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1796 continue;
1797
1798 int uniform = inst->src[i].nr;
1799
1800 if (inst->src[i].reladdr->reladdr)
1801 nested_reladdr = true; /* will need another pass */
1802
1803 /* If this array isn't already present in the pull constant buffer,
1804 * add it.
1805 */
1806 if (pull_constant_loc[uniform] == -1) {
1807 const gl_constant_value **values =
1808 &stage_prog_data->param[uniform * 4];
1809
1810 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1811
1812 assert(uniform < uniform_array_size);
1813 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1814 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1815 = values[j];
1816 }
1817 }
1818
1819 /* Set up the annotation tracking for new generated instructions. */
1820 base_ir = inst->ir;
1821 current_annotation = inst->annotation;
1822
1823 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1824
1825 emit_pull_constant_load(block, inst, temp, inst->src[i],
1826 pull_constant_loc[uniform]);
1827
1828 inst->src[i].file = temp.file;
1829 inst->src[i].nr = temp.nr;
1830 inst->src[i].reg_offset = temp.reg_offset;
1831 inst->src[i].reladdr = NULL;
1832 }
1833 }
1834 } while (nested_reladdr);
1835
1836 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1837 * no need to track them as larger-than-vec4 objects. This will be
1838 * relied on in cutting out unused uniform vectors from push
1839 * constants.
1840 */
1841 split_uniform_registers();
1842 }
1843
1844 void
1845 vec4_visitor::resolve_ud_negate(src_reg *reg)
1846 {
1847 if (reg->type != BRW_REGISTER_TYPE_UD ||
1848 !reg->negate)
1849 return;
1850
1851 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1852 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1853 *reg = temp;
1854 }
1855
1856 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1857 void *log_data,
1858 const struct brw_sampler_prog_key_data *key_tex,
1859 struct brw_vue_prog_data *prog_data,
1860 const nir_shader *shader,
1861 void *mem_ctx,
1862 bool no_spills,
1863 int shader_time_index)
1864 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1865 key_tex(key_tex),
1866 prog_data(prog_data),
1867 fail_msg(NULL),
1868 first_non_payload_grf(0),
1869 need_all_constants_in_pull_buffer(false),
1870 no_spills(no_spills),
1871 shader_time_index(shader_time_index),
1872 last_scratch(0)
1873 {
1874 this->failed = false;
1875
1876 this->base_ir = NULL;
1877 this->current_annotation = NULL;
1878 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1879
1880 this->virtual_grf_start = NULL;
1881 this->virtual_grf_end = NULL;
1882 this->live_intervals = NULL;
1883
1884 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1885
1886 this->uniforms = 0;
1887
1888 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1889 * at least one. See setup_uniforms() in brw_vec4.cpp.
1890 */
1891 this->uniform_array_size = 1;
1892 if (prog_data) {
1893 this->uniform_array_size =
1894 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1895 }
1896
1897 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1898 }
1899
1900 vec4_visitor::~vec4_visitor()
1901 {
1902 }
1903
1904
1905 void
1906 vec4_visitor::fail(const char *format, ...)
1907 {
1908 va_list va;
1909 char *msg;
1910
1911 if (failed)
1912 return;
1913
1914 failed = true;
1915
1916 va_start(va, format);
1917 msg = ralloc_vasprintf(mem_ctx, format, va);
1918 va_end(va);
1919 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1920
1921 this->fail_msg = msg;
1922
1923 if (debug_enabled) {
1924 fprintf(stderr, "%s", msg);
1925 }
1926 }
1927
1928 } /* namespace brw */