i965/vec4: fix scratch writes for 64bit data
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
58 this->group = 0;
59 this->size_written = (dst.file == BAD_FILE ?
60 0 : this->exec_size * type_sz(dst.type));
61 this->annotation = NULL;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67 inst->ir = this->base_ir;
68 inst->annotation = this->current_annotation;
69
70 this->instructions.push_tail(inst);
71
72 return inst;
73 }
74
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77 vec4_instruction *new_inst)
78 {
79 new_inst->ir = inst->ir;
80 new_inst->annotation = inst->annotation;
81
82 inst->insert_before(block, new_inst);
83
84 return inst;
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1, const src_reg &src2)
90 {
91 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97 const src_reg &src1)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113
114 vec4_instruction *
115 vec4_visitor::emit(enum opcode opcode)
116 {
117 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119
120 #define ALU1(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
123 { \
124 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125 }
126
127 #define ALU2(op) \
128 vec4_instruction * \
129 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
130 const src_reg &src1) \
131 { \
132 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
133 src0, src1); \
134 }
135
136 #define ALU2_ACC(op) \
137 vec4_instruction * \
138 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
139 const src_reg &src1) \
140 { \
141 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
142 BRW_OPCODE_##op, dst, src0, src1); \
143 inst->writes_accumulator = true; \
144 return inst; \
145 }
146
147 #define ALU3(op) \
148 vec4_instruction * \
149 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
150 const src_reg &src1, const src_reg &src2) \
151 { \
152 assert(devinfo->gen >= 6); \
153 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
154 src0, src1, src2); \
155 }
156
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195 vec4_instruction *inst;
196
197 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198 inst->predicate = predicate;
199
200 return inst;
201 }
202
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206 enum brw_conditional_mod condition)
207 {
208 assert(devinfo->gen == 6);
209
210 vec4_instruction *inst;
211
212 resolve_ud_negate(&src0);
213 resolve_ud_negate(&src1);
214
215 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 /**
223 * CMP: Sets the low bit of the destination channels with the result
224 * of the comparison, while the upper bits are undefined, and updates
225 * the flag register with the packed 16 bits of the result.
226 */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229 enum brw_conditional_mod condition)
230 {
231 vec4_instruction *inst;
232
233 /* Take the instruction:
234 *
235 * CMP null<d> src0<f> src1<f>
236 *
237 * Original gen4 does type conversion to the destination type before
238 * comparison, producing garbage results for floating point comparisons.
239 *
240 * The destination type doesn't matter on newer generations, so we set the
241 * type to match src0 so we can compact the instruction.
242 */
243 dst.type = src0.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310 if (!src.abs && !src.negate)
311 return src;
312
313 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314 resolved.type = src.type;
315 emit(MOV(resolved, src));
316
317 return src_reg(resolved);
318 }
319
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324 return src;
325
326 /* The gen6 math instruction ignores the source modifiers --
327 * swizzle, abs, negate, and at least some parts of the register
328 * region description.
329 *
330 * Rather than trying to enumerate all these cases, *always* expand the
331 * operand to a temp GRF for gen6.
332 *
333 * For gen7, keep the operand as-is, except if immediate, which gen7 still
334 * can't use.
335 */
336
337 if (devinfo->gen == 7 && src.file != IMM)
338 return src;
339
340 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341 expanded.type = src.type;
342 emit(MOV(expanded, src));
343 return src_reg(expanded);
344 }
345
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348 const dst_reg &dst,
349 const src_reg &src0, const src_reg &src1)
350 {
351 vec4_instruction *math =
352 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353
354 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355 /* MATH on Gen6 must be align1, so we can't do writemasks. */
356 math->dst = dst_reg(this, glsl_type::vec4_type);
357 math->dst.type = dst.type;
358 math = emit(MOV(dst, src_reg(math->dst)));
359 } else if (devinfo->gen < 6) {
360 math->base_mrf = 1;
361 math->mlen = src1.file == BAD_FILE ? 1 : 2;
362 }
363
364 return math;
365 }
366
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370 if (devinfo->gen < 7) {
371 unreachable("ir_unop_pack_half_2x16 should be lowered");
372 }
373
374 assert(dst.type == BRW_REGISTER_TYPE_UD);
375 assert(src0.type == BRW_REGISTER_TYPE_F);
376
377 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378 *
379 * Because this instruction does not have a 16-bit floating-point type,
380 * the destination data type must be Word (W).
381 *
382 * The destination must be DWord-aligned and specify a horizontal stride
383 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
384 * each destination channel and the upper word is not modified.
385 *
386 * The above restriction implies that the f32to16 instruction must use
387 * align1 mode, because only in align1 mode is it possible to specify
388 * horizontal stride. We choose here to defy the hardware docs and emit
389 * align16 instructions.
390 *
391 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392 * instructions. I was partially successful in that the code passed all
393 * tests. However, the code was dubiously correct and fragile, and the
394 * tests were not harsh enough to probe that frailty. Not trusting the
395 * code, I chose instead to remain in align16 mode in defiance of the hw
396 * docs).
397 *
398 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399 * simulator, emitting a f32to16 in align16 mode with UD as destination
400 * data type is safe. The behavior differs from that specified in the PRM
401 * in that the upper word of each destination channel is cleared to 0.
402 */
403
404 dst_reg tmp_dst(this, glsl_type::uvec2_type);
405 src_reg tmp_src(tmp_dst);
406
407 #if 0
408 /* Verify the undocumented behavior on which the following instructions
409 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
410 * then the result of the bit-or instruction below will be incorrect.
411 *
412 * You should inspect the disasm output in order to verify that the MOV is
413 * not optimized away.
414 */
415 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417
418 /* Give tmp the form below, where "." means untouched.
419 *
420 * w z y x w z y x
421 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422 *
423 * That the upper word of each write-channel be 0 is required for the
424 * following bit-shift and bit-or instructions to work. Note that this
425 * relies on the undocumented hardware behavior mentioned above.
426 */
427 tmp_dst.writemask = WRITEMASK_XY;
428 emit(F32TO16(tmp_dst, src0));
429
430 /* Give the write-channels of dst the form:
431 * 0xhhhh0000
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435
436 /* Finally, give the write-channels of dst the form of packHalf2x16's
437 * output:
438 * 0xhhhhllll
439 */
440 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441 emit(OR(dst, src_reg(dst), tmp_src));
442 }
443
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447 if (devinfo->gen < 7) {
448 unreachable("ir_unop_unpack_half_2x16 should be lowered");
449 }
450
451 assert(dst.type == BRW_REGISTER_TYPE_F);
452 assert(src0.type == BRW_REGISTER_TYPE_UD);
453
454 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455 *
456 * Because this instruction does not have a 16-bit floating-point type,
457 * the source data type must be Word (W). The destination type must be
458 * F (Float).
459 *
460 * To use W as the source data type, we must adjust horizontal strides,
461 * which is only possible in align1 mode. All my [chadv] attempts at
462 * emitting align1 instructions for unpackHalf2x16 failed to pass the
463 * Piglit tests, so I gave up.
464 *
465 * I've verified that, on gen7 hardware and the simulator, it is safe to
466 * emit f16to32 in align16 mode with UD as source data type.
467 */
468
469 dst_reg tmp_dst(this, glsl_type::uvec2_type);
470 src_reg tmp_src(tmp_dst);
471
472 tmp_dst.writemask = WRITEMASK_X;
473 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474
475 tmp_dst.writemask = WRITEMASK_Y;
476 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477
478 dst.writemask = WRITEMASK_XY;
479 emit(F16TO32(dst, tmp_src));
480 }
481
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487 * is not suitable to generate the shift values, but we can use the packed
488 * vector float and a type-converting MOV.
489 */
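   /* Note: brw_imm_vf4(0x00, 0x60, 0x70, 0x78) is the restricted 8-bit
    * vector-float encoding of <0.0, 8.0, 16.0, 24.0>; the type-converting
    * MOV below turns those into the integer shift counts 0, 8, 16 and 24.
    */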
490 dst_reg shift(this, glsl_type::uvec4_type);
491 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492
493 dst_reg shifted(this, glsl_type::uvec4_type);
494 src0.swizzle = BRW_SWIZZLE_XXXX;
495 emit(SHR(shifted, src0, src_reg(shift)));
496
497 shifted.type = BRW_REGISTER_TYPE_UB;
498 dst_reg f(this, glsl_type::vec4_type);
499 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500
501 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509 * is not suitable to generate the shift values, but we can use the packed
510 * vector float and a type-converting MOV.
511 */
512 dst_reg shift(this, glsl_type::uvec4_type);
513 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514
515 dst_reg shifted(this, glsl_type::uvec4_type);
516 src0.swizzle = BRW_SWIZZLE_XXXX;
517 emit(SHR(shifted, src0, src_reg(shift)));
518
519 shifted.type = BRW_REGISTER_TYPE_B;
520 dst_reg f(this, glsl_type::vec4_type);
521 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522
523 dst_reg scaled(this, glsl_type::vec4_type);
524 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525
526 dst_reg max(this, glsl_type::vec4_type);
527 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534 dst_reg saturated(this, glsl_type::vec4_type);
535 vec4_instruction *inst = emit(MOV(saturated, src0));
536 inst->saturate = true;
537
538 dst_reg scaled(this, glsl_type::vec4_type);
539 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540
541 dst_reg rounded(this, glsl_type::vec4_type);
542 emit(RNDE(rounded, src_reg(scaled)));
543
544 dst_reg u(this, glsl_type::uvec4_type);
545 emit(MOV(u, src_reg(rounded)));
546
547 src_reg bytes(u);
548 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554 dst_reg max(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556
557 dst_reg min(this, glsl_type::vec4_type);
558 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559
560 dst_reg scaled(this, glsl_type::vec4_type);
561 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562
563 dst_reg rounded(this, glsl_type::vec4_type);
564 emit(RNDE(rounded, src_reg(scaled)));
565
566 dst_reg i(this, glsl_type::ivec4_type);
567 emit(MOV(i, src_reg(rounded)));
568
569 src_reg bytes(i);
570 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572
573 /*
574 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575 * false) elements needed to pack a type.
576 */
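/* For example, a dvec3 or dvec4 matrix column is dual-slot, so for a dmat3
 * this returns 6 when as_vec4 is true (two vec4 slots per column) but 3 when
 * counting dvec4 slots.
 */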
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580 unsigned int i;
581 int size;
582
583 switch (type->base_type) {
584 case GLSL_TYPE_UINT:
585 case GLSL_TYPE_INT:
586 case GLSL_TYPE_FLOAT:
587 case GLSL_TYPE_BOOL:
588 case GLSL_TYPE_DOUBLE:
589 if (type->is_matrix()) {
590 const glsl_type *col_type = type->column_type();
591 unsigned col_slots =
592 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
593 return type->matrix_columns * col_slots;
594 } else {
595 /* Regardless of size of vector, it gets a vec4. This is bad
596 * packing for things like floats, but otherwise arrays become a
597 * mess. Hopefully a later pass over the code can pack scalars
598 * down if appropriate.
599 */
600 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
601 }
602 case GLSL_TYPE_ARRAY:
603 assert(type->length > 0);
604 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
605 case GLSL_TYPE_STRUCT:
606 size = 0;
607 for (i = 0; i < type->length; i++) {
608 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
609 }
610 return size;
611 case GLSL_TYPE_SUBROUTINE:
612 return 1;
613
614 case GLSL_TYPE_SAMPLER:
615 /* Samplers take up no register space, since they're baked in at
616 * link time.
617 */
618 return 0;
619 case GLSL_TYPE_ATOMIC_UINT:
620 return 0;
621 case GLSL_TYPE_IMAGE:
622 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623 case GLSL_TYPE_VOID:
624 case GLSL_TYPE_ERROR:
625 case GLSL_TYPE_INTERFACE:
626 case GLSL_TYPE_FUNCTION:
627 unreachable("not reached");
628 }
629
630 return 0;
631 }
632
633 /**
634 * Returns the minimum number of vec4 elements needed to pack a type.
635 *
636 * For simple types, it will return 1 (a single vec4); for matrices, the
637 * number of columns; for array and struct, the sum of the vec4_size of
638 * each of its elements; and for sampler and atomic, zero.
639 *
640 * This method is useful to calculate how much register space is needed to
641 * store a particular type.
642 */
643 extern "C" int
644 type_size_vec4(const struct glsl_type *type)
645 {
646 return type_size_xvec4(type, true);
647 }
648
649 /**
650 * Returns the minimum number of dvec4 elements needed to pack a type.
651 *
652 * For simple types, it will return 1 (a single dvec4); for matrices, the
653 * number of columns; for array and struct, the sum of the dvec4_size of
654 * each of its elements; and for sampler and atomic, zero.
655 *
656 * This method is useful to calculate how much register space is needed to
657 * store a particular type.
658 *
659 * Measuring double-precision vertex inputs as dvec4 is required because
660 * ARB_vertex_attrib_64bit states that these use the same number of locations
661 * as the single-precision version. That is, two consecutive dvec4s would be
662 * located in location "x" and location "x+1", not "x+2".
663 *
664 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
665 * remap_vs_attrs() takes into account both the location and whether the
666 * type fits in one or two vec4 slots.
667 */
668 extern "C" int
669 type_size_dvec4(const struct glsl_type *type)
670 {
671 return type_size_xvec4(type, false);
672 }
673
674 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
675 {
676 init();
677
678 this->file = VGRF;
679 this->nr = v->alloc.allocate(type_size_vec4(type));
680
681 if (type->is_array() || type->is_record()) {
682 this->swizzle = BRW_SWIZZLE_NOOP;
683 } else {
684 this->swizzle = brw_swizzle_for_size(type->vector_elements);
685 }
686
687 this->type = brw_type_for_base_type(type);
688 }
689
690 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
691 {
692 assert(size > 0);
693
694 init();
695
696 this->file = VGRF;
697 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
698
699 this->swizzle = BRW_SWIZZLE_NOOP;
700
701 this->type = brw_type_for_base_type(type);
702 }
703
704 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
705 {
706 init();
707
708 this->file = VGRF;
709 this->nr = v->alloc.allocate(type_size_vec4(type));
710
711 if (type->is_array() || type->is_record()) {
712 this->writemask = WRITEMASK_XYZW;
713 } else {
714 this->writemask = (1 << type->vector_elements) - 1;
715 }
716
717 this->type = brw_type_for_base_type(type);
718 }
719
720 vec4_instruction *
721 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
722 src_reg src0, src_reg src1)
723 {
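   /* SEL with a conditional modifier writes src0 when the comparison
    * src0 <cmod> src1 passes and src1 otherwise, so BRW_CONDITIONAL_GE
    * gives a per-channel max and BRW_CONDITIONAL_L a per-channel min.
    */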
724 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
725 inst->conditional_mod = conditionalmod;
726 return inst;
727 }
728
729 vec4_instruction *
730 vec4_visitor::emit_lrp(const dst_reg &dst,
731 const src_reg &x, const src_reg &y, const src_reg &a)
732 {
733 if (devinfo->gen >= 6) {
734 /* Note that the instruction's argument order is reversed from GLSL
735 * and the IR.
736 */
737 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
738 fix_3src_operand(x)));
739 } else {
740 /* Earlier generations don't support three source operations, so we
741 * need to emit x*(1-a) + y*a.
742 */
743 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
744 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
745 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
746 y_times_a.writemask = dst.writemask;
747 one_minus_a.writemask = dst.writemask;
748 x_times_one_minus_a.writemask = dst.writemask;
749
750 emit(MUL(y_times_a, y, a));
751 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
752 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
753 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
754 }
755 }
756
757 /**
758 * Emits the instructions needed to perform a pull constant load. before_block
759 * and before_inst can be NULL, in which case the instruction will be appended
760 * to the end of the instruction list.
761 */
762 void
763 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
764 src_reg surf_index,
765 src_reg offset_reg,
766 bblock_t *before_block,
767 vec4_instruction *before_inst)
768 {
769 assert((before_inst == NULL && before_block == NULL) ||
770 (before_inst && before_block));
771
772 vec4_instruction *pull;
773
774 if (devinfo->gen >= 9) {
775 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
776 src_reg header(this, glsl_type::uvec4_type, 2);
777
778 pull = new(mem_ctx)
779 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
780 dst_reg(header));
781
782 if (before_inst)
783 emit_before(before_block, before_inst, pull);
784 else
785 emit(pull);
786
787 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
788 offset_reg.type);
789 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
790
791 if (before_inst)
792 emit_before(before_block, before_inst, pull);
793 else
794 emit(pull);
795
796 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
797 dst,
798 surf_index,
799 header);
800 pull->mlen = 2;
801 pull->header_size = 1;
802 } else if (devinfo->gen >= 7) {
803 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
804
805 grf_offset.type = offset_reg.type;
806
807 pull = MOV(grf_offset, offset_reg);
808
809 if (before_inst)
810 emit_before(before_block, before_inst, pull);
811 else
812 emit(pull);
813
814 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
815 dst,
816 surf_index,
817 src_reg(grf_offset));
818 pull->mlen = 1;
819 } else {
820 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
821 dst,
822 surf_index,
823 offset_reg);
824 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
825 pull->mlen = 1;
826 }
827
828 if (before_inst)
829 emit_before(before_block, before_inst, pull);
830 else
831 emit(pull);
832 }
833
834 src_reg
835 vec4_visitor::emit_uniformize(const src_reg &src)
836 {
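   /* FIND_LIVE_CHANNEL computes the index of the first enabled channel and
    * BROADCAST copies that channel's value of src into every channel of dst,
    * yielding a value that is uniform across the SIMD group.
    */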
837 const src_reg chan_index(this, glsl_type::uint_type);
838 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
839 src.type);
840
841 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
842 ->force_writemask_all = true;
843 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
844 ->force_writemask_all = true;
845
846 return src_reg(dst);
847 }
848
849 src_reg
850 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
851 src_reg coordinate, src_reg surface)
852 {
853 vec4_instruction *inst =
854 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
855 dst_reg(this, glsl_type::uvec4_type));
856 inst->base_mrf = 2;
857 inst->src[1] = surface;
858 inst->src[2] = surface;
859
860 int param_base;
861
862 if (devinfo->gen >= 9) {
863 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
864 vec4_instruction *header_inst = new(mem_ctx)
865 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
866 dst_reg(MRF, inst->base_mrf));
867
868 emit(header_inst);
869
870 inst->mlen = 2;
871 inst->header_size = 1;
872 param_base = inst->base_mrf + 1;
873 } else {
874 inst->mlen = 1;
875 param_base = inst->base_mrf;
876 }
877
878 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
879 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
880 int zero_mask = 0xf & ~coord_mask;
881
882 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
883 coordinate));
884
885 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
886 brw_imm_d(0)));
887
888 emit(inst);
889 return src_reg(inst->dst);
890 }
891
892 bool
893 vec4_visitor::is_high_sampler(src_reg sampler)
894 {
895 if (devinfo->gen < 8 && !devinfo->is_haswell)
896 return false;
897
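   /* Sampler indices 0..15 can be encoded directly in the message descriptor;
    * larger (or non-constant) indices need the sampler state pointer adjusted
    * through the message header, which we only do on Haswell and later.
    */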
898 return sampler.file != IMM || sampler.ud >= 16;
899 }
900
901 void
902 vec4_visitor::emit_texture(ir_texture_opcode op,
903 dst_reg dest,
904 const glsl_type *dest_type,
905 src_reg coordinate,
906 int coord_components,
907 src_reg shadow_comparator,
908 src_reg lod, src_reg lod2,
909 src_reg sample_index,
910 uint32_t constant_offset,
911 src_reg offset_value,
912 src_reg mcs,
913 uint32_t surface,
914 src_reg surface_reg,
915 src_reg sampler_reg)
916 {
917 /* The sampler can only meaningfully compute LOD for fragment shader
918 * messages. For all other stages, we change the opcode to TXL and hardcode
919 * the LOD to 0.
920 *
921 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
922 * valid LOD argument.
923 */
924 if (op == ir_tex || op == ir_query_levels) {
925 assert(lod.file == BAD_FILE);
926 lod = brw_imm_f(0.0f);
927 }
928
929 enum opcode opcode;
930 switch (op) {
931 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
932 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
933 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
934 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
935 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
936 SHADER_OPCODE_TXF_CMS); break;
937 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
938 case ir_tg4: opcode = offset_value.file != BAD_FILE
939 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
940 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
941 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
942 case ir_txb:
943 unreachable("TXB is not valid for vertex shaders.");
944 case ir_lod:
945 unreachable("LOD is not valid for vertex shaders.");
946 case ir_samples_identical: {
947 /* There are some challenges implementing this for vec4, and it seems
948 * unlikely to be used anyway. For now, just always return false.
949 */
950 emit(MOV(dest, brw_imm_ud(0u)));
951 return;
952 }
953 default:
954 unreachable("Unrecognized tex op");
955 }
956
957 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
958
959 inst->offset = constant_offset;
960
961 /* The message header is necessary for:
962 * - Gen4 (always)
963 * - Gen9+ for selecting SIMD4x2
964 * - Texel offsets
965 * - Gather channel selection
966 * - Sampler indices too large to fit in a 4-bit value.
967 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
968 */
969 inst->header_size =
970 (devinfo->gen < 5 || devinfo->gen >= 9 ||
971 inst->offset != 0 || op == ir_tg4 ||
972 op == ir_texture_samples ||
973 is_high_sampler(sampler_reg)) ? 1 : 0;
974 inst->base_mrf = 2;
975 inst->mlen = inst->header_size;
976 inst->dst.writemask = WRITEMASK_XYZW;
977 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
978
979 inst->src[1] = surface_reg;
980 inst->src[2] = sampler_reg;
981
982 /* MRF for the first parameter */
983 int param_base = inst->base_mrf + inst->header_size;
984
985 if (op == ir_txs || op == ir_query_levels) {
986 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
987 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
988 inst->mlen++;
989 } else if (op == ir_texture_samples) {
990 inst->dst.writemask = WRITEMASK_X;
991 } else {
992 /* Load the coordinate */
993 /* FINISHME: gl_clamp_mask and saturate */
994 int coord_mask = (1 << coord_components) - 1;
995 int zero_mask = 0xf & ~coord_mask;
996
997 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
998 coordinate));
999 inst->mlen++;
1000
1001 if (zero_mask != 0) {
1002 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1003 brw_imm_d(0)));
1004 }
1005 /* Load the shadow comparator */
1006 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1007 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1008 WRITEMASK_X),
1009 shadow_comparator));
1010 inst->mlen++;
1011 }
1012
1013 /* Load the LOD info */
1014 if (op == ir_tex || op == ir_txl) {
1015 int mrf, writemask;
1016 if (devinfo->gen >= 5) {
1017 mrf = param_base + 1;
1018 if (shadow_comparator.file != BAD_FILE) {
1019 writemask = WRITEMASK_Y;
1020 /* mlen already incremented */
1021 } else {
1022 writemask = WRITEMASK_X;
1023 inst->mlen++;
1024 }
1025 } else /* devinfo->gen == 4 */ {
1026 mrf = param_base;
1027 writemask = WRITEMASK_W;
1028 }
1029 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1030 } else if (op == ir_txf) {
1031 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1032 } else if (op == ir_txf_ms) {
1033 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1034 sample_index));
1035 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1036 /* MCS data is stored in the first two channels of 'mcs', but we
1037 * need to get it into the .y and .z channels of the second vec4
1038 * of params.
1039 */
1040 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1041 emit(MOV(dst_reg(MRF, param_base + 1,
1042 glsl_type::uint_type, WRITEMASK_YZ),
1043 mcs));
1044 } else if (devinfo->gen >= 7) {
1045 /* MCS data is in the first channel of `mcs`, but we need to get it into
1046 * the .y channel of the second vec4 of params, so replicate .x across
1047 * the whole vec4 and then mask off everything except .y
1048 */
1049 mcs.swizzle = BRW_SWIZZLE_XXXX;
1050 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1051 mcs));
1052 }
1053 inst->mlen++;
1054 } else if (op == ir_txd) {
1055 const brw_reg_type type = lod.type;
1056
1057 if (devinfo->gen >= 5) {
1058 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1059 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1060 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1061 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1062 inst->mlen++;
1063
1064 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1065 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1066 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1067 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1068 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1069 inst->mlen++;
1070
1071 if (shadow_comparator.file != BAD_FILE) {
1072 emit(MOV(dst_reg(MRF, param_base + 2,
1073 shadow_comparator.type, WRITEMASK_Z),
1074 shadow_comparator));
1075 }
1076 }
1077 } else /* devinfo->gen == 4 */ {
1078 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1079 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1080 inst->mlen += 2;
1081 }
1082 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1083 if (shadow_comparator.file != BAD_FILE) {
1084 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1085 shadow_comparator));
1086 }
1087
1088 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1089 offset_value));
1090 inst->mlen++;
1091 }
1092 }
1093
1094 emit(inst);
1095
1096 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1097 * spec requires layers.
1098 */
1099 if (op == ir_txs && devinfo->gen < 7) {
1100 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1101 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1102 src_reg(inst->dst), brw_imm_d(1));
1103 }
1104
1105 if (devinfo->gen == 6 && op == ir_tg4) {
1106 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1107 }
1108
1109 if (op == ir_query_levels) {
1110 /* # levels is in .w */
1111 src_reg swizzled(dest);
1112 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1113 SWIZZLE_W, SWIZZLE_W);
1114 emit(MOV(dest, swizzled));
1115 }
1116 }
1117
1118 /**
1119 * Apply workarounds for Gen6 gather with UINT/SINT
1120 */
1121 void
1122 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1123 {
1124 if (!wa)
1125 return;
1126
1127 int width = (wa & WA_8BIT) ? 8 : 16;
1128 dst_reg dst_f = dst;
1129 dst_f.type = BRW_REGISTER_TYPE_F;
1130
1131 /* Convert from UNORM to UINT */
1132 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1133 emit(MOV(dst, src_reg(dst_f)));
1134
1135 if (wa & WA_SIGN) {
1136 /* Reinterpret the UINT value as a signed INT value by
1137 * shifting the sign bit into place, then shifting back
1138 * preserving sign.
1139 */
1140 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1141 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1142 }
1143 }
1144
1145 void
1146 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1147 {
1148 unreachable("not reached");
1149 }
1150
1151 void
1152 vec4_visitor::gs_end_primitive()
1153 {
1154 unreachable("not reached");
1155 }
1156
1157 void
1158 vec4_visitor::emit_ndc_computation()
1159 {
1160 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1161 return;
1162
1163 /* Get the position */
1164 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1165
1166 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1167 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1168 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1169 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1170
1171 current_annotation = "NDC";
1172 dst_reg ndc_w = ndc;
1173 ndc_w.writemask = WRITEMASK_W;
1174 src_reg pos_w = pos;
1175 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1176 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1177
1178 dst_reg ndc_xyz = ndc;
1179 ndc_xyz.writemask = WRITEMASK_XYZ;
1180
1181 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1182 }
1183
1184 void
1185 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1186 {
1187 if (devinfo->gen < 6 &&
1188 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1189 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1190 devinfo->has_negative_rhw_bug)) {
1191 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1192 dst_reg header1_w = header1;
1193 header1_w.writemask = WRITEMASK_W;
1194
1195 emit(MOV(header1, brw_imm_ud(0u)));
1196
1197 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1198 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1199
1200 current_annotation = "Point size";
1201 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1202 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1203 }
1204
1205 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1206 current_annotation = "Clipping flags";
1207 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1208 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1209
1210 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1211 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1212 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1213
1214 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1215 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1216 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1217 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1218 }
1219
1220 /* i965 clipping workaround:
1221 * 1) Test for -ve rhw
1222 * 2) If set,
1223 * set ndc = (0,0,0,0)
1224 * set ucp[6] = 1
1225 *
1226 * Later, clipping will detect ucp[6] and ensure the primitive is
1227 * clipped against all fixed planes.
1228 */
1229 if (devinfo->has_negative_rhw_bug &&
1230 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1231 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1232 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1233 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234 vec4_instruction *inst;
1235 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1236 inst->predicate = BRW_PREDICATE_NORMAL;
1237 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1238 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240 }
1241
1242 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1243 } else if (devinfo->gen < 6) {
1244 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1245 } else {
1246 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1247 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1248 dst_reg reg_w = reg;
1249 reg_w.writemask = WRITEMASK_W;
1250 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1251 reg_as_src.type = reg_w.type;
1252 reg_as_src.swizzle = brw_swizzle_for_size(1);
1253 emit(MOV(reg_w, reg_as_src));
1254 }
1255 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1256 dst_reg reg_y = reg;
1257 reg_y.writemask = WRITEMASK_Y;
1258 reg_y.type = BRW_REGISTER_TYPE_D;
1259 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1260 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1261 }
1262 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1263 dst_reg reg_z = reg;
1264 reg_z.writemask = WRITEMASK_Z;
1265 reg_z.type = BRW_REGISTER_TYPE_D;
1266 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1267 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1268 }
1269 }
1270 }
1271
1272 vec4_instruction *
1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1274 {
1275 assert(varying < VARYING_SLOT_MAX);
1276
1277 unsigned num_comps = output_num_components[varying][component];
1278 if (num_comps == 0)
1279 return NULL;
1280
1281 assert(output_reg[varying][component].type == reg.type);
1282 current_annotation = output_reg_annotation[varying];
1283 if (output_reg[varying][component].file != BAD_FILE) {
1284 src_reg src = src_reg(output_reg[varying][component]);
1285 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1286 reg.writemask =
1287 brw_writemask_for_component_packing(num_comps, component);
1288 return emit(MOV(reg, src));
1289 }
1290 return NULL;
1291 }
1292
1293 void
1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1295 {
1296 reg.type = BRW_REGISTER_TYPE_F;
1297 output_reg[varying][0].type = reg.type;
1298
1299 switch (varying) {
1300 case VARYING_SLOT_PSIZ:
1301 {
1302 /* PSIZ is always in slot 0, and is coupled with other flags. */
1303 current_annotation = "indices, point width, clip flags";
1304 emit_psiz_and_flags(reg);
1305 break;
1306 }
1307 case BRW_VARYING_SLOT_NDC:
1308 current_annotation = "NDC";
1309 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1310 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1311 break;
1312 case VARYING_SLOT_POS:
1313 current_annotation = "gl_Position";
1314 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1315 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1316 break;
1317 case VARYING_SLOT_EDGE:
1318 /* This is present when doing unfilled polygons. We're supposed to copy
1319 * the edge flag from the user-provided vertex array
1320 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1321 * of that attribute (starts as 1.0f). This is then used in clipping to
1322 * determine which edges should be drawn as wireframe.
1323 */
1324 current_annotation = "edge flag";
1325 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1326 glsl_type::float_type, WRITEMASK_XYZW))));
1327 break;
1328 case BRW_VARYING_SLOT_PAD:
1329 /* No need to write to this slot */
1330 break;
1331 default:
1332 for (int i = 0; i < 4; i++) {
1333 emit_generic_urb_slot(reg, varying, i);
1334 }
1335 break;
1336 }
1337 }
1338
1339 static int
1340 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1341 {
1342 if (devinfo->gen >= 6) {
1343 /* URB data written (does not include the message header reg) must
1344 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1345 * section 5.4.3.2.2: URB_INTERLEAVED.
1346 *
1347 * URB entries are allocated on a multiple of 1024 bits, so an
1348 * extra 128 bits written here to make the end align to 256 is
1349 * no problem.
1350 */
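      /* mlen here includes the single message header register, so the data
       * portion (mlen - 1) is even exactly when mlen is odd; round even
       * values of mlen up by one.
       */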
1351 if ((mlen % 2) != 1)
1352 mlen++;
1353 }
1354
1355 return mlen;
1356 }
1357
1358
1359 /**
1360 * Generates the VUE payload plus the necessary URB write instructions to
1361 * output it.
1362 *
1363 * The VUE layout is documented in Volume 2a.
1364 */
1365 void
1366 vec4_visitor::emit_vertex()
1367 {
1368 /* MRF 0 is reserved for the debugger, so start with message header
1369 * in MRF 1.
1370 */
1371 int base_mrf = 1;
1372 int mrf = base_mrf;
1373 /* In the process of generating our URB write message contents, we
1374 * may need to unspill a register or load from an array. Those
1375 * reads would use MRFs 14-15.
1376 */
1377 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1378
1379 /* The following assertion verifies that max_usable_mrf causes an
1380 * even-numbered amount of URB write data, which will meet gen6's
1381 * requirements for length alignment.
1382 */
1383 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1384
1385 /* First mrf is the g0-based message header containing URB handles and
1386 * such.
1387 */
1388 emit_urb_write_header(mrf++);
1389
1390 if (devinfo->gen < 6) {
1391 emit_ndc_computation();
1392 }
1393
1394 /* We may need to split this up into several URB writes, so do them in a
1395 * loop.
1396 */
1397 int slot = 0;
1398 bool complete = false;
1399 do {
1400 /* URB offset is in URB row increments, and each of our MRFs is half of
1401 * one of those, since we're doing interleaved writes.
1402 */
1403 int offset = slot / 2;
1404
1405 mrf = base_mrf + 1;
1406 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1407 emit_urb_slot(dst_reg(MRF, mrf++),
1408 prog_data->vue_map.slot_to_varying[slot]);
1409
1410 /* If this was max_usable_mrf, we can't fit anything more into this
1411 * URB WRITE. Same thing if we reached the maximum length available.
1412 */
1413 if (mrf > max_usable_mrf ||
1414 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1415 slot++;
1416 break;
1417 }
1418 }
1419
1420 complete = slot >= prog_data->vue_map.num_slots;
1421 current_annotation = "URB write";
1422 vec4_instruction *inst = emit_urb_write_opcode(complete);
1423 inst->base_mrf = base_mrf;
1424 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1425 inst->offset += offset;
1426 } while(!complete);
1427 }
1428
1429
1430 src_reg
1431 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1432 src_reg *reladdr, int reg_offset)
1433 {
1434 /* Because we store the values to scratch interleaved like our
1435 * vertex data, we need to scale the vec4 index by 2.
1436 */
1437 int message_header_scale = 2;
1438
1439 /* Pre-gen6, the message header uses byte offsets instead of vec4
1440 * (16-byte) offset units.
1441 */
1442 if (devinfo->gen < 6)
1443 message_header_scale *= 16;
1444
1445 if (reladdr) {
1446 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1447 * to multiply the reladdr by 2. Notice that the reg_offset part
1448 * is in units of 16 bytes and is used to select the low/high 16-byte
1449 * chunk of a full dvec4, so we don't want to multiply that part.
1450 */
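      /* For example, on gen7 (message_header_scale == 2) element i of a
       * dvec4 array spilled at reg_offset r yields index = i * 4 + r * 2,
       * whereas the 32-bit path below computes (i + r) * 2.
       */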
1451 src_reg index = src_reg(this, glsl_type::int_type);
1452 if (type_sz(inst->dst.type) < 8) {
1453 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1454 brw_imm_d(reg_offset)));
1455 emit_before(block, inst, MUL(dst_reg(index), index,
1456 brw_imm_d(message_header_scale)));
1457 } else {
1458 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1459 brw_imm_d(message_header_scale * 2)));
1460 emit_before(block, inst, ADD(dst_reg(index), index,
1461 brw_imm_d(reg_offset * message_header_scale)));
1462 }
1463 return index;
1464 } else {
1465 return brw_imm_d(reg_offset * message_header_scale);
1466 }
1467 }
1468
1469 /**
1470 * Emits an instruction before @inst to load the value named by @orig_src
1471 * from scratch space at @base_offset to @temp.
1472 *
1473 * @base_offset is measured in 32-byte units (the size of a register).
1474 */
1475 void
1476 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1477 dst_reg temp, src_reg orig_src,
1478 int base_offset)
1479 {
1480 assert(orig_src.offset % REG_SIZE == 0);
1481 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1482 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1483 reg_offset);
1484
1485 if (type_sz(orig_src.type) < 8) {
1486 emit_before(block, inst, SCRATCH_READ(temp, index));
1487 } else {
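      /* A 64-bit value spans two registers in scratch, so issue two 32-bit
       * scratch reads into a float-typed temporary and then un-shuffle the
       * result into the 64-bit layout the rest of the program expects.
       */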
1488 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1489 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1490 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1491 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1492 vec4_instruction *last_read =
1493 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1494 emit_before(block, inst, last_read);
1495 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1496 }
1497 }
1498
1499 /**
1500 * Emits an instruction after @inst to store the value to be written
1501 * to @orig_dst to scratch space at @base_offset, from @temp.
1502 *
1503 * @base_offset is measured in 32-byte units (the size of a register).
1504 */
1505 void
1506 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1507 int base_offset)
1508 {
1509 assert(inst->dst.offset % REG_SIZE == 0);
1510 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1511 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1512 reg_offset);
1513
1514 /* Create a temporary register to store *inst's result in.
1515 *
1516 * We have to be careful in MOVing from our temporary result register in
1517 * the scratch write. If we swizzle from channels of the temporary that
1518 * weren't initialized, it will confuse live interval analysis, which will
1519 * make spilling fail to make progress.
1520 */
1521 bool is_64bit = type_sz(inst->dst.type) == 8;
1522 const glsl_type *alloc_type =
1523 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1524 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1525 inst->dst.type),
1526 brw_swizzle_for_mask(inst->dst.writemask));
1527
1528 if (!is_64bit) {
1529 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1530 inst->dst.writemask));
1531 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1532 if (inst->opcode != BRW_OPCODE_SEL)
1533 write->predicate = inst->predicate;
1534 write->ir = inst->ir;
1535 write->annotation = inst->annotation;
1536 inst->insert_after(block, write);
1537 } else {
1538 dst_reg shuffled = dst_reg(this, alloc_type);
1539 vec4_instruction *last =
1540 shuffle_64bit_data(shuffled, temp, true, block, inst);
1541 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1542
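      /* shuffle_64bit_data() has arranged the data so that doubles X and Y of
       * the original writemask live in the first register (as the dword pairs
       * XY and ZW) and doubles Z and W live in the second. Translate the
       * 64-bit writemask into the corresponding 32-bit masks for each of the
       * two scratch-write messages, skipping a message that would not write
       * anything.
       */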
1543 uint8_t mask = 0;
1544 if (inst->dst.writemask & WRITEMASK_X)
1545 mask |= WRITEMASK_XY;
1546 if (inst->dst.writemask & WRITEMASK_Y)
1547 mask |= WRITEMASK_ZW;
1548 if (mask) {
1549 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1550
1551 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1552 if (inst->opcode != BRW_OPCODE_SEL)
1553 write->predicate = inst->predicate;
1554 write->ir = inst->ir;
1555 write->annotation = inst->annotation;
1556 last->insert_after(block, write);
1557 }
1558
1559 mask = 0;
1560 if (inst->dst.writemask & WRITEMASK_Z)
1561 mask |= WRITEMASK_XY;
1562 if (inst->dst.writemask & WRITEMASK_W)
1563 mask |= WRITEMASK_ZW;
1564 if (mask) {
1565 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1566
1567 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1568 reg_offset + 1);
1569 vec4_instruction *write =
1570 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1571 if (inst->opcode != BRW_OPCODE_SEL)
1572 write->predicate = inst->predicate;
1573 write->ir = inst->ir;
1574 write->annotation = inst->annotation;
1575 last->insert_after(block, write);
1576 }
1577 }
1578
1579 inst->dst.file = temp.file;
1580 inst->dst.nr = temp.nr;
1581 inst->dst.offset %= REG_SIZE;
1582 inst->dst.reladdr = NULL;
1583 }
1584
1585 /**
1586 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1587 * adds the scratch read(s) before \p inst. The function also checks for
1588 * recursive reladdr scratch accesses, issuing the corresponding scratch
1589 * loads and rewriting reladdr references accordingly.
1590 *
1591 * \return \p src if it did not require a scratch load, otherwise, the
1592 * register holding the result of the scratch load that the caller should
1593 * use to rewrite src.
1594 */
1595 src_reg
1596 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1597 vec4_instruction *inst, src_reg src)
1598 {
1599 /* Resolve recursive reladdr scratch access by calling ourselves
1600 * with src.reladdr
1601 */
1602 if (src.reladdr)
1603 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1604 *src.reladdr);
1605
1606 /* Now handle scratch access on src */
1607 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1608 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1609 glsl_type::dvec4_type : glsl_type::vec4_type);
1610 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1611 src.nr = temp.nr;
1612 src.offset %= REG_SIZE;
1613 src.reladdr = NULL;
1614 }
1615
1616 return src;
1617 }
1618
1619 /**
1620 * We can't generally support array access in GRF space, because a
1621 * single instruction's destination can only span 2 contiguous
1622 * registers. So, we send all GRF arrays that get variable index
1623 * access to scratch space.
1624 */
1625 void
1626 vec4_visitor::move_grf_array_access_to_scratch()
1627 {
1628 int scratch_loc[this->alloc.count];
1629 memset(scratch_loc, -1, sizeof(scratch_loc));
1630
1631 /* First, calculate the set of virtual GRFs that need to be punted
1632 * to scratch due to having any array access on them, and where in
1633 * scratch.
1634 */
1635 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1636 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1637 if (scratch_loc[inst->dst.nr] == -1) {
1638 scratch_loc[inst->dst.nr] = last_scratch;
1639 last_scratch += this->alloc.sizes[inst->dst.nr];
1640 }
1641
1642 for (src_reg *iter = inst->dst.reladdr;
1643 iter->reladdr;
1644 iter = iter->reladdr) {
1645 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1646 scratch_loc[iter->nr] = last_scratch;
1647 last_scratch += this->alloc.sizes[iter->nr];
1648 }
1649 }
1650 }
1651
1652 for (int i = 0 ; i < 3; i++) {
1653 for (src_reg *iter = &inst->src[i];
1654 iter->reladdr;
1655 iter = iter->reladdr) {
1656 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1657 scratch_loc[iter->nr] = last_scratch;
1658 last_scratch += this->alloc.sizes[iter->nr];
1659 }
1660 }
1661 }
1662 }
1663
1664 /* Now, for anything that will be accessed through scratch, rewrite
1665 * it to load/store. Note that this is a _safe list walk, because
1666 * we may generate a new scratch_write instruction after the one
1667 * we're processing.
1668 */
1669 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1670 /* Set up the annotation tracking for new generated instructions. */
1671 base_ir = inst->ir;
1672 current_annotation = inst->annotation;
1673
1674 /* First handle scratch access on the dst. Notice we have to handle
1675 * the case where the dst's reladdr also points to scratch space.
1676 */
1677 if (inst->dst.reladdr)
1678 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1679 *inst->dst.reladdr);
1680
1681 /* Now that we have handled any (possibly recursive) reladdr scratch
1682 * accesses for dst we can safely do the scratch write for dst itself
1683 */
1684 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1685 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1686
1687 /* Now handle scratch access on any src. In this case, since inst->src[i]
1688 * already is a src_reg, we can just call emit_resolve_reladdr with
1689 * inst->src[i] and it will take care of handling scratch loads for
1690 * both src and src.reladdr (recursively).
1691 */
1692 for (int i = 0 ; i < 3; i++) {
1693 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1694 inst->src[i]);
1695 }
1696 }
1697 }
1698
1699 /**
1700 * Emits an instruction before @inst to load the value named by @orig_src
1701 * from the pull constant buffer (surface) at @base_offset to @temp.
1702 */
1703 void
1704 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1705 dst_reg temp, src_reg orig_src,
1706 int base_offset, src_reg indirect)
1707 {
1708 assert(orig_src.offset % 16 == 0);
1709 int reg_offset = base_offset + orig_src.offset / 16;
1710 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1711
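   /* reg_offset is in units of vec4 (16-byte) slots; it is scaled by 16 below
    * to form the offset source of the pull constant load.
    */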
1712 src_reg offset;
1713 if (indirect.file != BAD_FILE) {
1714 offset = src_reg(this, glsl_type::uint_type);
1715
1716 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1717 brw_imm_ud(reg_offset * 16)));
1718 } else if (devinfo->gen >= 8) {
1719 /* Store the offset in a GRF so we can send-from-GRF. */
1720 offset = src_reg(this, glsl_type::uint_type);
1721 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1722 } else {
1723 offset = brw_imm_d(reg_offset * 16);
1724 }
1725
1726 emit_pull_constant_load_reg(temp,
1727 brw_imm_ud(index),
1728 offset,
1729 block, inst);
1730
1731 brw_mark_surface_used(&prog_data->base, index);
1732 }
1733
1734 /**
1735 * Implements array access of uniforms by inserting a
1736 * PULL_CONSTANT_LOAD instruction.
1737 *
1738 * Unlike temporary GRF array access (where we don't support it due to
1739 * the difficulty of doing relative addressing on instruction
1740 * destinations), we could potentially do array access of uniforms
1741 * that were loaded in GRF space as push constants. In real-world
1742 * usage we've seen, though, the arrays being used are always larger
1743 * than we could load as push constants, so just always move all
1744 * uniform array access out to a pull constant buffer.
1745 */
1746 void
1747 vec4_visitor::move_uniform_array_access_to_pull_constants()
1748 {
1749 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1750 * everything has to be pushed regardless.
1751 */
1752 if (stage_prog_data->pull_param == NULL) {
1753 split_uniform_registers();
1754 return;
1755 }
1756
1757 int pull_constant_loc[this->uniforms];
1758 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1759
1760 /* First, walk through the instructions and determine which things need to
1761 * be pulled. We mark something as needing to be pulled by setting
1762 * pull_constant_loc to 0.
1763 */
1764 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1765 /* We only care about MOV_INDIRECT of a uniform */
1766 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1767 inst->src[0].file != UNIFORM)
1768 continue;
1769
1770 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1771
1772 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1773 pull_constant_loc[uniform_nr + j] = 0;
1774 }
1775
1776 /* Next, we walk the list of uniforms and assign real pull constant
1777 * locations and set their corresponding entries in pull_param.
1778 */
1779 for (int j = 0; j < this->uniforms; j++) {
1780 if (pull_constant_loc[j] < 0)
1781 continue;
1782
1783 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1784
1785 for (int i = 0; i < 4; i++) {
1786 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1787 = stage_prog_data->param[j * 4 + i];
1788 }
1789 }
1790
1791 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1792 * instructions to actual uniform pulls.
1793 */
1794 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1795 /* We only care about MOV_INDIRECT of a uniform */
1796 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1797 inst->src[0].file != UNIFORM)
1798 continue;
1799
1800 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1801
1802 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1803
1804 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1805 pull_constant_loc[uniform_nr], inst->src[1]);
1806 inst->remove(block);
1807 }
1808
1809 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1810 * no need to track them as larger-than-vec4 objects. This will be
1811 * relied on in cutting out unused uniform vectors from push
1812 * constants.
1813 */
1814 split_uniform_registers();
1815 }
1816
1817 void
1818 vec4_visitor::resolve_ud_negate(src_reg *reg)
1819 {
1820 if (reg->type != BRW_REGISTER_TYPE_UD ||
1821 !reg->negate)
1822 return;
1823
1824 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1825 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1826 *reg = temp;
1827 }
1828
1829 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1830 void *log_data,
1831 const struct brw_sampler_prog_key_data *key_tex,
1832 struct brw_vue_prog_data *prog_data,
1833 const nir_shader *shader,
1834 void *mem_ctx,
1835 bool no_spills,
1836 int shader_time_index)
1837 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1838 key_tex(key_tex),
1839 prog_data(prog_data),
1840 fail_msg(NULL),
1841 first_non_payload_grf(0),
1842 need_all_constants_in_pull_buffer(false),
1843 no_spills(no_spills),
1844 shader_time_index(shader_time_index),
1845 last_scratch(0)
1846 {
1847 this->failed = false;
1848
1849 this->base_ir = NULL;
1850 this->current_annotation = NULL;
1851 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1852
1853 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1854
1855 this->virtual_grf_start = NULL;
1856 this->virtual_grf_end = NULL;
1857 this->live_intervals = NULL;
1858
1859 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1860
1861 this->uniforms = 0;
1862 }
1863
1864 vec4_visitor::~vec4_visitor()
1865 {
1866 }
1867
1868
1869 void
1870 vec4_visitor::fail(const char *format, ...)
1871 {
1872 va_list va;
1873 char *msg;
1874
1875 if (failed)
1876 return;
1877
1878 failed = true;
1879
1880 va_start(va, format);
1881 msg = ralloc_vasprintf(mem_ctx, format, va);
1882 va_end(va);
1883 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1884
1885 this->fail_msg = msg;
1886
1887 if (debug_enabled) {
1888 fprintf(stderr, "%s", msg);
1889 }
1890 }
1891
1892 } /* namespace brw */