src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cs.h"
  46 #include "brw_vec4_gs_visitor.h"
  47 #include "brw_cfg.h"
  48 #include "brw_dead_control_flow.h"
  49 #include "main/uniforms.h"
  50 #include "brw_fs_live_variables.h"
  51 #include "glsl/nir/glsl_types.h"
  52 #include "program/sampler.h"
  53
  54 using namespace brw;
  55
  56 void
  57 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  58               const fs_reg *src, unsigned sources)
  59 {
  60    memset(this, 0, sizeof(*this));
  61
  62    this->src = new fs_reg[MAX2(sources, 3)];
  63    for (unsigned i = 0; i < sources; i++)
  64       this->src[i] = src[i];
  65
  66    this->opcode = opcode;
  67    this->dst = dst;
  68    this->sources = sources;
  69    this->exec_size = exec_size;
  70
  71    assert(dst.file != IMM && dst.file != UNIFORM);
  72
  73    assert(this->exec_size != 0);
  74
  75    this->conditional_mod = BRW_CONDITIONAL_NONE;
  76
  77    /* This will be the case for almost all instructions. */
  78    switch (dst.file) {
  79    case VGRF:
  80    case ARF:
  81    case FIXED_GRF:
  82    case MRF:
  83    case ATTR:
  84       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
  85                                         REG_SIZE);
  86       break;
  87    case BAD_FILE:
  88       this->regs_written = 0;
  89       break;
  90    case IMM:
  91    case UNIFORM:
  92       unreachable("Invalid destination register file");
  93    }
  94
  95    this->writes_accumulator = false;
  96 }
  97
  98 fs_inst::fs_inst()
  99 {
 100    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 101 }
 102
 103 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 104 {
 105    init(opcode, exec_size, reg_undef, NULL, 0);
 106 }
 107
 108 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 109 {
 110    init(opcode, exec_size, dst, NULL, 0);
 111 }
 112
 113 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 114                  const fs_reg &src0)
 115 {
 116    const fs_reg src[1] = { src0 };
 117    init(opcode, exec_size, dst, src, 1);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 121                  const fs_reg &src0, const fs_reg &src1)
 122 {
 123    const fs_reg src[2] = { src0, src1 };
 124    init(opcode, exec_size, dst, src, 2);
 125 }
 126
 127 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 128                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 129 {
 130    const fs_reg src[3] = { src0, src1, src2 };
 131    init(opcode, exec_size, dst, src, 3);
 132 }
 133
 134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 135                  const fs_reg src[], unsigned sources)
 136 {
 137    init(opcode, exec_width, dst, src, sources);
 138 }
 139
 140 fs_inst::fs_inst(const fs_inst &that)
 141 {
 142    memcpy(this, &that, sizeof(that));
 143
 144    this->src = new fs_reg[MAX2(that.sources, 3)];
 145
 146    for (unsigned i = 0; i < that.sources; i++)
 147       this->src[i] = that.src[i];
 148 }
 149
 150 fs_inst::~fs_inst()
 151 {
 152    delete[] this->src;
 153 }
 154
 155 void
 156 fs_inst::resize_sources(uint8_t num_sources)
 157 {
 158    if (this->sources != num_sources) {
 159       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 160
 161       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 162          src[i] = this->src[i];
 163
 164       delete[] this->src;
 165       this->src = src;
 166       this->sources = num_sources;
 167    }
 168 }
 169
 170 void
 171 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 172                                        const fs_reg &dst,
 173                                        const fs_reg &surf_index,
 174                                        const fs_reg &varying_offset,
 175                                        uint32_t const_offset)
 176 {
 177    /* We have our constant surface use a pitch of 4 bytes, so our index can
 178     * be any component of a vector, and then we load 4 contiguous
 179     * components starting from that.
 180     *
 181     * We break down the const_offset to a portion added to the variable
 182     * offset and a portion done using reg_offset, which means that if you
 183     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 184     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 185     * CSE can later notice that those loads are all the same and eliminate
 186     * the redundant ones.
 187     */
 188    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 189    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 190
 191    int scale = 1;
 192    if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
 193       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 194        * u, v, r) as parameters, or we can just use the SIMD16 message
 195        * consisting of (header, u).  We choose the second, at the cost of a
 196        * longer return length.
 197        */
 198       scale = 2;
 199    }
 200
 201    enum opcode op;
 202    if (devinfo->gen >= 7)
 203       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 204    else
 205       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 206
 207    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
 208    fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
 209    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 210    inst->regs_written = regs_written;
 211
 212    if (devinfo->gen < 7) {
 213       inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
 214       inst->header_size = 1;
 215       if (devinfo->gen == 4)
 216          inst->mlen = 3;
 217       else
 218          inst->mlen = 1 + bld.dispatch_width() / 8;
 219    }
 220
 221    bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 222 }
 223
 224 /**
 225  * A helper for MOV generation for fixing up broken hardware SEND dependency
 226  * handling.
 227  */
 228 void
 229 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 230 {
 231    /* The caller always wants uncompressed to emit the minimal extra
 232     * dependencies, and to avoid having to deal with aligning its regs to 2.
 233     */
 234    const fs_builder ubld = bld.annotate("send dependency resolve")
 235                               .half(0);
 236
 237    ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
 238 }
 239
 240 bool
 241 fs_inst::equals(fs_inst *inst) const
 242 {
 243    return (opcode == inst->opcode &&
 244            dst.equals(inst->dst) &&
 245            src[0].equals(inst->src[0]) &&
 246            src[1].equals(inst->src[1]) &&
 247            src[2].equals(inst->src[2]) &&
 248            saturate == inst->saturate &&
 249            predicate == inst->predicate &&
 250            conditional_mod == inst->conditional_mod &&
 251            mlen == inst->mlen &&
 252            base_mrf == inst->base_mrf &&
 253            target == inst->target &&
 254            eot == inst->eot &&
 255            header_size == inst->header_size &&
 256            shadow_compare == inst->shadow_compare &&
 257            exec_size == inst->exec_size &&
 258            offset == inst->offset);
 259 }
 260
 261 bool
 262 fs_inst::overwrites_reg(const fs_reg &reg) const
 263 {
 264    return reg.in_range(dst, regs_written);
 265 }
 266
 267 bool
 268 fs_inst::is_send_from_grf() const
 269 {
 270    switch (opcode) {
 271    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 272    case SHADER_OPCODE_SHADER_TIME_ADD:
 273    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 274    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 275    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 276    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 277    case SHADER_OPCODE_UNTYPED_ATOMIC:
 278    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 279    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 280    case SHADER_OPCODE_TYPED_ATOMIC:
 281    case SHADER_OPCODE_TYPED_SURFACE_READ:
 282    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 283    case SHADER_OPCODE_URB_WRITE_SIMD8:
 284    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
 285    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
 286    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
 287    case SHADER_OPCODE_URB_READ_SIMD8:
 288    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
 289       return true;
 290    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 291       return src[1].file == VGRF;
 292    case FS_OPCODE_FB_WRITE:
 293       return src[0].file == VGRF;
 294    default:
 295       if (is_tex())
 296          return src[0].file == VGRF;
 297
 298       return false;
 299    }
 300 }
 301
 302 bool
 303 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 304 {
 305    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 306       return false;
 307
 308    fs_reg reg = this->src[0];
 309    if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0)
 310       return false;
 311
 312    if (grf_alloc.sizes[reg.nr] != this->regs_written)
 313       return false;
 314
 315    for (int i = 0; i < this->sources; i++) {
 316       reg.type = this->src[i].type;
 317       if (!this->src[i].equals(reg))
 318          return false;
 319
 320       if (i < this->header_size) {
 321          reg.reg_offset += 1;
 322       } else {
 323          reg.reg_offset += this->exec_size / 8;
 324       }
 325    }
 326
 327    return true;
 328 }
 329
 330 bool
 331 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 332 {
 333    if (devinfo->gen == 6 && is_math())
 334       return false;
 335
 336    if (is_send_from_grf())
 337       return false;
 338
 339    if (!backend_instruction::can_do_source_mods())
 340       return false;
 341
 342    return true;
 343 }
 344
 345 bool
 346 fs_inst::can_change_types() const
 347 {
 348    return dst.type == src[0].type &&
 349           !src[0].abs && !src[0].negate && !saturate &&
 350           (opcode == BRW_OPCODE_MOV ||
 351            (opcode == BRW_OPCODE_SEL &&
 352             dst.type == src[1].type &&
 353             predicate != BRW_PREDICATE_NONE &&
 354             !src[1].abs && !src[1].negate));
 355 }
 356
 357 bool
 358 fs_inst::has_side_effects() const
 359 {
 360    return this->eot || backend_instruction::has_side_effects();
 361 }
 362
 363 void
 364 fs_reg::init()
 365 {
 366    memset(this, 0, sizeof(*this));
 367    stride = 1;
 368 }
 369
 370 /** Generic unset register constructor. */
 371 fs_reg::fs_reg()
 372 {
 373    init();
 374    this->file = BAD_FILE;
 375 }
 376
 377 /** Immediate value constructor. */
 378 fs_reg::fs_reg(float f)
 379 {
 380    init();
 381    this->file = IMM;
 382    this->type = BRW_REGISTER_TYPE_F;
 383    this->stride = 0;
 384    this->f = f;
 385 }
 386
 387 /** Immediate value constructor. */
 388 fs_reg::fs_reg(int32_t i)
 389 {
 390    init();
 391    this->file = IMM;
 392    this->type = BRW_REGISTER_TYPE_D;
 393    this->stride = 0;
 394    this->d = i;
 395 }
 396
 397 /** Immediate value constructor. */
 398 fs_reg::fs_reg(uint32_t u)
 399 {
 400    init();
 401    this->file = IMM;
 402    this->type = BRW_REGISTER_TYPE_UD;
 403    this->stride = 0;
 404    this->ud = u;
 405 }
 406
 407 /** Vector float immediate value constructor. */
 408 fs_reg::fs_reg(uint8_t vf[4])
 409 {
 410    init();
 411    this->file = IMM;
 412    this->type = BRW_REGISTER_TYPE_VF;
 413    memcpy(&this->ud, vf, sizeof(unsigned));
 414 }
 415
 416 /** Vector float immediate value constructor. */
 417 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 418 {
 419    init();
 420    this->file = IMM;
 421    this->type = BRW_REGISTER_TYPE_VF;
 422    this->ud = (vf0 <<  0) | (vf1 <<  8) | (vf2 << 16) | (vf3 << 24);
 423 }
 424
 425 fs_reg::fs_reg(struct brw_reg reg) :
 426    backend_reg(reg)
 427 {
 428    this->reg_offset = 0;
 429    this->subreg_offset = 0;
 430    this->reladdr = NULL;
 431    this->stride = 1;
 432    if (this->file == IMM &&
 433        (this->type != BRW_REGISTER_TYPE_V &&
 434         this->type != BRW_REGISTER_TYPE_UV &&
 435         this->type != BRW_REGISTER_TYPE_VF)) {
 436       this->stride = 0;
 437    }
 438 }
 439
 440 bool
 441 fs_reg::equals(const fs_reg &r) const
 442 {
 443    return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
 444            reg_offset == r.reg_offset &&
 445            subreg_offset == r.subreg_offset &&
 446            !reladdr && !r.reladdr &&
 447            stride == r.stride);
 448 }
 449
 450 fs_reg &
 451 fs_reg::set_smear(unsigned subreg)
 452 {
 453    assert(file != ARF && file != FIXED_GRF && file != IMM);
 454    subreg_offset = subreg * type_sz(type);
 455    stride = 0;
 456    return *this;
 457 }
 458
 459 bool
 460 fs_reg::is_contiguous() const
 461 {
 462    return stride == 1;
 463 }
 464
 465 unsigned
 466 fs_reg::component_size(unsigned width) const
 467 {
 468    const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
 469                             hstride == 0 ? 0 :
 470                             1 << (hstride - 1));
 471    return MAX2(width * stride, 1) * type_sz(type);
 472 }
 473
 474 extern "C" int
 475 type_size_scalar(const struct glsl_type *type)
 476 {
 477    unsigned int size, i;
 478
 479    switch (type->base_type) {
 480    case GLSL_TYPE_UINT:
 481    case GLSL_TYPE_INT:
 482    case GLSL_TYPE_FLOAT:
 483    case GLSL_TYPE_BOOL:
 484       return type->components();
 485    case GLSL_TYPE_ARRAY:
 486       return type_size_scalar(type->fields.array) * type->length;
 487    case GLSL_TYPE_STRUCT:
 488       size = 0;
 489       for (i = 0; i < type->length; i++) {
 490          size += type_size_scalar(type->fields.structure[i].type);
 491       }
 492       return size;
 493    case GLSL_TYPE_SAMPLER:
 494       /* Samplers take up no register space, since they're baked in at
 495        * link time.
 496        */
 497       return 0;
 498    case GLSL_TYPE_ATOMIC_UINT:
 499       return 0;
 500    case GLSL_TYPE_SUBROUTINE:
 501       return 1;
 502    case GLSL_TYPE_IMAGE:
 503       return BRW_IMAGE_PARAM_SIZE;
 504    case GLSL_TYPE_VOID:
 505    case GLSL_TYPE_ERROR:
 506    case GLSL_TYPE_INTERFACE:
 507    case GLSL_TYPE_DOUBLE:
 508       unreachable("not reached");
 509    }
 510
 511    return 0;
 512 }
 513
 514 /**
 515  * Returns the number of scalar components needed to store type, assuming
 516  * that vectors are padded out to vec4.
 517  *
 518  * This has the packing rules of type_size_vec4(), but counts components
 519  * similar to type_size_scalar().
 520  */
 521 extern "C" int
 522 type_size_vec4_times_4(const struct glsl_type *type)
 523 {
 524    return 4 * type_size_vec4(type);
 525 }
 526
 527 /**
 528  * Create a MOV to read the timestamp register.
 529  *
 530  * The caller is responsible for emitting the MOV.  The return value is
 531  * the destination of the MOV, with extra parameters set.
 532  */
 533 fs_reg
 534 fs_visitor::get_timestamp(const fs_builder &bld)
 535 {
 536    assert(devinfo->gen >= 7);
 537
 538    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 539                                           BRW_ARF_TIMESTAMP,
 540                                           0),
 541                              BRW_REGISTER_TYPE_UD));
 542
 543    fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 544
 545    /* We want to read the 3 fields we care about even if it's not enabled in
 546     * the dispatch.
 547     */
 548    bld.group(4, 0).exec_all().MOV(dst, ts);
 549
 550    return dst;
 551 }
 552
 553 void
 554 fs_visitor::emit_shader_time_begin()
 555 {
 556    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 557
 558    /* We want only the low 32 bits of the timestamp.  Since it's running
 559     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 560     * which is plenty of time for our purposes.  It is identical across the
 561     * EUs, but since it's tracking GPU core speed it will increment at a
 562     * varying rate as render P-states change.
 563     */
 564    shader_start_time.set_smear(0);
 565 }
 566
 567 void
 568 fs_visitor::emit_shader_time_end()
 569 {
 570    /* Insert our code just before the final SEND with EOT. */
 571    exec_node *end = this->instructions.get_tail();
 572    assert(end && ((fs_inst *) end)->eot);
 573    const fs_builder ibld = bld.annotate("shader time end")
 574                               .exec_all().at(NULL, end);
 575
 576    fs_reg shader_end_time = get_timestamp(ibld);
 577
 578    /* We only use the low 32 bits of the timestamp - see
 579     * emit_shader_time_begin()).
 580     *
 581     * We could also check if render P-states have changed (or anything
 582     * else that might disrupt timing) by setting smear to 2 and checking if
 583     * that field is != 0.
 584     */
 585    shader_end_time.set_smear(0);
 586
 587    /* Check that there weren't any timestamp reset events (assuming these
 588     * were the only two timestamp reads that happened).
 589     */
 590    fs_reg reset = shader_end_time;
 591    reset.set_smear(2);
 592    set_condmod(BRW_CONDITIONAL_Z,
 593                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 594    ibld.IF(BRW_PREDICATE_NORMAL);
 595
 596    fs_reg start = shader_start_time;
 597    start.negate = true;
 598    fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 599    diff.set_smear(0);
 600
 601    const fs_builder cbld = ibld.group(1, 0);
 602    cbld.group(1, 0).ADD(diff, start, shader_end_time);
 603
 604    /* If there were no instructions between the two timestamp gets, the diff
 605     * is 2 cycles.  Remove that overhead, so I can forget about that when
 606     * trying to determine the time taken for single instructions.
 607     */
 608    cbld.ADD(diff, diff, fs_reg(-2u));
 609    SHADER_TIME_ADD(cbld, 0, diff);
 610    SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
 611    ibld.emit(BRW_OPCODE_ELSE);
 612    SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
 613    ibld.emit(BRW_OPCODE_ENDIF);
 614 }
 615
 616 void
 617 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 618                             int shader_time_subindex,
 619                             fs_reg value)
 620 {
 621    int index = shader_time_index * 3 + shader_time_subindex;
 622    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 623
 624    fs_reg payload;
 625    if (dispatch_width == 8)
 626       payload = vgrf(glsl_type::uvec2_type);
 627    else
 628       payload = vgrf(glsl_type::uint_type);
 629
 630    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 631 }
 632
 633 void
 634 fs_visitor::vfail(const char *format, va_list va)
 635 {
 636    char *msg;
 637
 638    if (failed)
 639       return;
 640
 641    failed = true;
 642
 643    msg = ralloc_vasprintf(mem_ctx, format, va);
 644    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 645
 646    this->fail_msg = msg;
 647
 648    if (debug_enabled) {
 649       fprintf(stderr, "%s",  msg);
 650    }
 651 }
 652
 653 void
 654 fs_visitor::fail(const char *format, ...)
 655 {
 656    va_list va;
 657
 658    va_start(va, format);
 659    vfail(format, va);
 660    va_end(va);
 661 }
 662
 663 /**
 664  * Mark this program as impossible to compile in SIMD16 mode.
 665  *
 666  * During the SIMD8 compile (which happens first), we can detect and flag
 667  * things that are unsupported in SIMD16 mode, so the compiler can skip
 668  * the SIMD16 compile altogether.
 669  *
 670  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 671  */
 672 void
 673 fs_visitor::no16(const char *msg)
 674 {
 675    if (dispatch_width == 16) {
 676       fail("%s", msg);
 677    } else {
 678       simd16_unsupported = true;
 679
 680       compiler->shader_perf_log(log_data,
 681                                 "SIMD16 shader failed to compile: %s", msg);
 682    }
 683 }
 684
 685 /**
 686  * Returns true if the instruction has a flag that means it won't
 687  * update an entire destination register.
 688  *
 689  * For example, dead code elimination and live variable analysis want to know
 690  * when a write to a variable screens off any preceding values that were in
 691  * it.
 692  */
 693 bool
 694 fs_inst::is_partial_write() const
 695 {
 696    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 697            (this->exec_size * type_sz(this->dst.type)) < 32 ||
 698            !this->dst.is_contiguous());
 699 }
 700
 701 unsigned
 702 fs_inst::components_read(unsigned i) const
 703 {
 704    switch (opcode) {
 705    case FS_OPCODE_LINTERP:
 706       if (i == 0)
 707          return 2;
 708       else
 709          return 1;
 710
 711    case FS_OPCODE_PIXEL_X:
 712    case FS_OPCODE_PIXEL_Y:
 713       assert(i == 0);
 714       return 2;
 715
 716    case FS_OPCODE_FB_WRITE_LOGICAL:
 717       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
 718       /* First/second FB write color. */
 719       if (i < 2)
 720          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
 721       else
 722          return 1;
 723
 724    case SHADER_OPCODE_TEX_LOGICAL:
 725    case SHADER_OPCODE_TXD_LOGICAL:
 726    case SHADER_OPCODE_TXF_LOGICAL:
 727    case SHADER_OPCODE_TXL_LOGICAL:
 728    case SHADER_OPCODE_TXS_LOGICAL:
 729    case FS_OPCODE_TXB_LOGICAL:
 730    case SHADER_OPCODE_TXF_CMS_LOGICAL:
 731    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
 732    case SHADER_OPCODE_TXF_UMS_LOGICAL:
 733    case SHADER_OPCODE_TXF_MCS_LOGICAL:
 734    case SHADER_OPCODE_LOD_LOGICAL:
 735    case SHADER_OPCODE_TG4_LOGICAL:
 736    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
 737       assert(src[8].file == IMM && src[9].file == IMM);
 738       /* Texture coordinates. */
 739       if (i == 0)
 740          return src[8].ud;
 741       /* Texture derivatives. */
 742       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
 743          return src[9].ud;
 744       /* Texture offset. */
 745       else if (i == 7)
 746          return 2;
 747       /* MCS */
 748       else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
 749          return 2;
 750       else
 751          return 1;
 752
 753    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
 754    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
 755       assert(src[3].file == IMM);
 756       /* Surface coordinates. */
 757       if (i == 0)
 758          return src[3].ud;
 759       /* Surface operation source (ignored for reads). */
 760       else if (i == 1)
 761          return 0;
 762       else
 763          return 1;
 764
 765    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
 766    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
 767       assert(src[3].file == IMM &&
 768              src[4].file == IMM);
 769       /* Surface coordinates. */
 770       if (i == 0)
 771          return src[3].ud;
 772       /* Surface operation source. */
 773       else if (i == 1)
 774          return src[4].ud;
 775       else
 776          return 1;
 777
 778    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
 779    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
 780       assert(src[3].file == IMM &&
 781              src[4].file == IMM);
 782       const unsigned op = src[4].ud;
 783       /* Surface coordinates. */
 784       if (i == 0)
 785          return src[3].ud;
 786       /* Surface operation source. */
 787       else if (i == 1 && op == BRW_AOP_CMPWR)
 788          return 2;
 789       else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
 790                           op == BRW_AOP_PREDEC))
 791          return 0;
 792       else
 793          return 1;
 794    }
 795
 796    default:
 797       return 1;
 798    }
 799 }
 800
 801 int
 802 fs_inst::regs_read(int arg) const
 803 {
 804    switch (opcode) {
 805    case FS_OPCODE_FB_WRITE:
 806    case SHADER_OPCODE_URB_WRITE_SIMD8:
 807    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
 808    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
 809    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
 810    case SHADER_OPCODE_URB_READ_SIMD8:
 811    case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
 812    case SHADER_OPCODE_UNTYPED_ATOMIC:
 813    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 814    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 815    case SHADER_OPCODE_TYPED_ATOMIC:
 816    case SHADER_OPCODE_TYPED_SURFACE_READ:
 817    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 818    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 819       if (arg == 0)
 820          return mlen;
 821       break;
 822
 823    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
 824       /* The payload is actually stored in src1 */
 825       if (arg == 1)
 826          return mlen;
 827       break;
 828
 829    case FS_OPCODE_LINTERP:
 830       if (arg == 1)
 831          return 1;
 832       break;
 833
 834    case SHADER_OPCODE_LOAD_PAYLOAD:
 835       if (arg < this->header_size)
 836          return 1;
 837       break;
 838
 839    case CS_OPCODE_CS_TERMINATE:
 840    case SHADER_OPCODE_BARRIER:
 841       return 1;
 842
 843    default:
 844       if (is_tex() && arg == 0 && src[0].file == VGRF)
 845          return mlen;
 846       break;
 847    }
 848
 849    switch (src[arg].file) {
 850    case BAD_FILE:
 851       return 0;
 852    case UNIFORM:
 853    case IMM:
 854       return 1;
 855    case ARF:
 856    case FIXED_GRF:
 857    case VGRF:
 858    case ATTR:
 859       return DIV_ROUND_UP(components_read(arg) *
 860                           src[arg].component_size(exec_size),
 861                           REG_SIZE);
 862    case MRF:
 863       unreachable("MRF registers are not allowed as sources");
 864    }
 865    return 0;
 866 }
 867
 868 bool
 869 fs_inst::reads_flag() const
 870 {
 871    return predicate;
 872 }
 873
 874 bool
 875 fs_inst::writes_flag() const
 876 {
 877    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 878                                opcode != BRW_OPCODE_IF &&
 879                                opcode != BRW_OPCODE_WHILE)) ||
 880           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 881 }
 882
 883 /**
 884  * Returns how many MRFs an FS opcode will write over.
 885  *
 886  * Note that this is not the 0 or 1 implied writes in an actual gen
 887  * instruction -- the FS opcodes often generate MOVs in addition.
 888  */
 889 int
 890 fs_visitor::implied_mrf_writes(fs_inst *inst)
 891 {
 892    if (inst->mlen == 0)
 893       return 0;
 894
 895    if (inst->base_mrf == -1)
 896       return 0;
 897
 898    switch (inst->opcode) {
 899    case SHADER_OPCODE_RCP:
 900    case SHADER_OPCODE_RSQ:
 901    case SHADER_OPCODE_SQRT:
 902    case SHADER_OPCODE_EXP2:
 903    case SHADER_OPCODE_LOG2:
 904    case SHADER_OPCODE_SIN:
 905    case SHADER_OPCODE_COS:
 906       return 1 * dispatch_width / 8;
 907    case SHADER_OPCODE_POW:
 908    case SHADER_OPCODE_INT_QUOTIENT:
 909    case SHADER_OPCODE_INT_REMAINDER:
 910       return 2 * dispatch_width / 8;
 911    case SHADER_OPCODE_TEX:
 912    case FS_OPCODE_TXB:
 913    case SHADER_OPCODE_TXD:
 914    case SHADER_OPCODE_TXF:
 915    case SHADER_OPCODE_TXF_CMS:
 916    case SHADER_OPCODE_TXF_CMS_W:
 917    case SHADER_OPCODE_TXF_MCS:
 918    case SHADER_OPCODE_TG4:
 919    case SHADER_OPCODE_TG4_OFFSET:
 920    case SHADER_OPCODE_TXL:
 921    case SHADER_OPCODE_TXS:
 922    case SHADER_OPCODE_LOD:
 923    case SHADER_OPCODE_SAMPLEINFO:
 924       return 1;
 925    case FS_OPCODE_FB_WRITE:
 926       return 2;
 927    case FS_OPCODE_GET_BUFFER_SIZE:
 928    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 929    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 930       return 1;
 931    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 932       return inst->mlen;
 933    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 934       return inst->mlen;
 935    case SHADER_OPCODE_UNTYPED_ATOMIC:
 936    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 937    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 938    case SHADER_OPCODE_TYPED_ATOMIC:
 939    case SHADER_OPCODE_TYPED_SURFACE_READ:
 940    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 941    case SHADER_OPCODE_URB_WRITE_SIMD8:
 942    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
 943    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
 944    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
 945    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 946    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 947    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 948    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 949       return 0;
 950    default:
 951       unreachable("not reached");
 952    }
 953 }
 954
 955 fs_reg
 956 fs_visitor::vgrf(const glsl_type *const type)
 957 {
 958    int reg_width = dispatch_width / 8;
 959    return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
 960                  brw_type_for_base_type(type));
 961 }
 962
 963 fs_reg::fs_reg(enum brw_reg_file file, int nr)
 964 {
 965    init();
 966    this->file = file;
 967    this->nr = nr;
 968    this->type = BRW_REGISTER_TYPE_F;
 969    this->stride = (file == UNIFORM ? 0 : 1);
 970 }
 971
 972 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
 973 {
 974    init();
 975    this->file = file;
 976    this->nr = nr;
 977    this->type = type;
 978    this->stride = (file == UNIFORM ? 0 : 1);
 979 }
 980
 981 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 982  * This brings in those uniform definitions
 983  */
 984 void
 985 fs_visitor::import_uniforms(fs_visitor *v)
 986 {
 987    this->push_constant_loc = v->push_constant_loc;
 988    this->pull_constant_loc = v->pull_constant_loc;
 989    this->uniforms = v->uniforms;
 990    this->param_size = v->param_size;
 991 }
 992
 993 fs_reg *
 994 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 995                                          bool origin_upper_left)
 996 {
 997    assert(stage == MESA_SHADER_FRAGMENT);
 998    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 999    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1000    fs_reg wpos = *reg;
1001    bool flip = !origin_upper_left ^ key->render_to_fbo;
1002
1003    /* gl_FragCoord.x */
1004    if (pixel_center_integer) {
1005       bld.MOV(wpos, this->pixel_x);
1006    } else {
1007       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
1008    }
1009    wpos = offset(wpos, bld, 1);
1010
1011    /* gl_FragCoord.y */
1012    if (!flip && pixel_center_integer) {
1013       bld.MOV(wpos, this->pixel_y);
1014    } else {
1015       fs_reg pixel_y = this->pixel_y;
1016       float offset = (pixel_center_integer ? 0.0f : 0.5f);
1017
1018       if (flip) {
1019          pixel_y.negate = true;
1020          offset += key->drawable_height - 1.0f;
1021       }
1022
1023       bld.ADD(wpos, pixel_y, fs_reg(offset));
1024    }
1025    wpos = offset(wpos, bld, 1);
1026
1027    /* gl_FragCoord.z */
1028    if (devinfo->gen >= 6) {
1029       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
1030    } else {
1031       bld.emit(FS_OPCODE_LINTERP, wpos,
1032            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1033            interp_reg(VARYING_SLOT_POS, 2));
1034    }
1035    wpos = offset(wpos, bld, 1);
1036
1037    /* gl_FragCoord.w: Already set up in emit_interpolation */
1038    bld.MOV(wpos, this->wpos_w);
1039
1040    return reg;
1041 }
1042
1043 fs_inst *
1044 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1045                          glsl_interp_qualifier interpolation_mode,
1046                          bool is_centroid, bool is_sample)
1047 {
1048    brw_wm_barycentric_interp_mode barycoord_mode;
1049    if (devinfo->gen >= 6) {
1050       if (is_centroid) {
1051          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1052             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1053          else
1054             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1055       } else if (is_sample) {
1056           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1057             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1058          else
1059             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1060       } else {
1061          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1062             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1063          else
1064             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1065       }
1066    } else {
1067       /* On Ironlake and below, there is only one interpolation mode.
1068        * Centroid interpolation doesn't mean anything on this hardware --
1069        * there is no multisampling.
1070        */
1071       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1072    }
1073    return bld.emit(FS_OPCODE_LINTERP, attr,
1074                    this->delta_xy[barycoord_mode], interp);
1075 }
1076
1077 void
1078 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1079                                        const glsl_type *type,
1080                                        glsl_interp_qualifier interpolation_mode,
1081                                        int location, bool mod_centroid,
1082                                        bool mod_sample)
1083 {
1084    attr.type = brw_type_for_base_type(type->get_scalar_type());
1085
1086    assert(stage == MESA_SHADER_FRAGMENT);
1087    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1088    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1089
1090    unsigned int array_elements;
1091
1092    if (type->is_array()) {
1093       array_elements = type->arrays_of_arrays_size();
1094       if (array_elements == 0) {
1095          fail("dereferenced array '%s' has length 0\n", name);
1096       }
1097       type = type->without_array();
1098    } else {
1099       array_elements = 1;
1100    }
1101
1102    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1103       bool is_gl_Color =
1104          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1105       if (key->flat_shade && is_gl_Color) {
1106          interpolation_mode = INTERP_QUALIFIER_FLAT;
1107       } else {
1108          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1109       }
1110    }
1111
1112    for (unsigned int i = 0; i < array_elements; i++) {
1113       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1114          if (prog_data->urb_setup[location] == -1) {
1115             /* If there's no incoming setup data for this slot, don't
1116              * emit interpolation for it.
1117              */
1118             attr = offset(attr, bld, type->vector_elements);
1119             location++;
1120             continue;
1121          }
1122
1123          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1124             /* Constant interpolation (flat shading) case. The SF has
1125              * handed us defined values in only the constant offset
1126              * field of the setup reg.
1127              */
1128             for (unsigned int k = 0; k < type->vector_elements; k++) {
1129                struct brw_reg interp = interp_reg(location, k);
1130                interp = suboffset(interp, 3);
1131                interp.type = attr.type;
1132                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1133                attr = offset(attr, bld, 1);
1134             }
1135          } else {
1136             /* Smooth/noperspective interpolation case. */
1137             for (unsigned int k = 0; k < type->vector_elements; k++) {
1138                struct brw_reg interp = interp_reg(location, k);
1139                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1140                   /* Get the pixel/sample mask into f0 so that we know
1141                    * which pixels are lit.  Then, for each channel that is
1142                    * unlit, replace the centroid data with non-centroid
1143                    * data.
1144                    */
1145                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1146
1147                   fs_inst *inst;
1148                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1149                                       false, false);
1150                   inst->predicate = BRW_PREDICATE_NORMAL;
1151                   inst->predicate_inverse = true;
1152                   if (devinfo->has_pln)
1153                      inst->no_dd_clear = true;
1154
1155                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1156                                       mod_centroid && !key->persample_shading,
1157                                       mod_sample || key->persample_shading);
1158                   inst->predicate = BRW_PREDICATE_NORMAL;
1159                   inst->predicate_inverse = false;
1160                   if (devinfo->has_pln)
1161                      inst->no_dd_check = true;
1162
1163                } else {
1164                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1165                                mod_centroid && !key->persample_shading,
1166                                mod_sample || key->persample_shading);
1167                }
1168                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1169                   bld.MUL(attr, attr, this->pixel_w);
1170                }
1171                attr = offset(attr, bld, 1);
1172             }
1173
1174          }
1175          location++;
1176       }
1177    }
1178 }
1179
1180 fs_reg *
1181 fs_visitor::emit_frontfacing_interpolation()
1182 {
1183    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1184
1185    if (devinfo->gen >= 6) {
1186       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1187        * a boolean result from this (~0/true or 0/false).
1188        *
1189        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1190        * this task in only one instruction:
1191        *    - a negation source modifier will flip the bit; and
1192        *    - a W -> D type conversion will sign extend the bit into the high
1193        *      word of the destination.
1194        *
1195        * An ASR 15 fills the low word of the destination.
1196        */
1197       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1198       g0.negate = true;
1199
1200       bld.ASR(*reg, g0, fs_reg(15));
1201    } else {
1202       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1203        * a boolean result from this (1/true or 0/false).
1204        *
1205        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1206        * the negation source modifier to flip it. Unfortunately the SHR
1207        * instruction only operates on UD (or D with an abs source modifier)
1208        * sources without negation.
1209        *
1210        * Instead, use ASR (which will give ~0/true or 0/false).
1211        */
1212       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1213       g1_6.negate = true;
1214
1215       bld.ASR(*reg, g1_6, fs_reg(31));
1216    }
1217
1218    return reg;
1219 }
1220
1221 void
1222 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1223 {
1224    assert(stage == MESA_SHADER_FRAGMENT);
1225    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1226    assert(dst.type == BRW_REGISTER_TYPE_F);
1227
1228    if (key->compute_pos_offset) {
1229       /* Convert int_sample_pos to floating point */
1230       bld.MOV(dst, int_sample_pos);
1231       /* Scale to the range [0, 1] */
1232       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1233    }
1234    else {
1235       /* From ARB_sample_shading specification:
1236        * "When rendering to a non-multisample buffer, or if multisample
1237        *  rasterization is disabled, gl_SamplePosition will always be
1238        *  (0.5, 0.5).
1239        */
1240       bld.MOV(dst, fs_reg(0.5f));
1241    }
1242 }
1243
1244 fs_reg *
1245 fs_visitor::emit_samplepos_setup()
1246 {
1247    assert(devinfo->gen >= 6);
1248
1249    const fs_builder abld = bld.annotate("compute sample position");
1250    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1251    fs_reg pos = *reg;
1252    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1253    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1254
1255    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1256     * mode will be enabled.
1257     *
1258     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1259     * R31.1:0         Position Offset X/Y for Slot[3:0]
1260     * R31.3:2         Position Offset X/Y for Slot[7:4]
1261     * .....
1262     *
1263     * The X, Y sample positions come in as bytes in  thread payload. So, read
1264     * the positions using vstride=16, width=8, hstride=2.
1265     */
1266    struct brw_reg sample_pos_reg =
1267       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1268                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1269
1270    if (dispatch_width == 8) {
1271       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1272    } else {
1273       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1274       abld.half(1).MOV(half(int_sample_x, 1),
1275                        fs_reg(suboffset(sample_pos_reg, 16)));
1276    }
1277    /* Compute gl_SamplePosition.x */
1278    compute_sample_position(pos, int_sample_x);
1279    pos = offset(pos, abld, 1);
1280    if (dispatch_width == 8) {
1281       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1282    } else {
1283       abld.half(0).MOV(half(int_sample_y, 0),
1284                        fs_reg(suboffset(sample_pos_reg, 1)));
1285       abld.half(1).MOV(half(int_sample_y, 1),
1286                        fs_reg(suboffset(sample_pos_reg, 17)));
1287    }
1288    /* Compute gl_SamplePosition.y */
1289    compute_sample_position(pos, int_sample_y);
1290    return reg;
1291 }
1292
1293 fs_reg *
1294 fs_visitor::emit_sampleid_setup()
1295 {
1296    assert(stage == MESA_SHADER_FRAGMENT);
1297    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1298    assert(devinfo->gen >= 6);
1299
1300    const fs_builder abld = bld.annotate("compute sample id");
1301    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1302
1303    if (key->compute_sample_id) {
1304       fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
1305       t1.set_smear(0);
1306       fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
1307
1308       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1309        * 8x multisampling, subspan 0 will represent sample N (where N
1310        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1311        * 7. We can find the value of N by looking at R0.0 bits 7:6
1312        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1313        * (since samples are always delivered in pairs). That is, we
1314        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1315        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1316        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1317        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1318        * populating a temporary variable with the sequence (0, 1, 2, 3),
1319        * and then reading from it using vstride=1, width=4, hstride=0.
1320        * These computations hold good for 4x multisampling as well.
1321        *
1322        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1323        * the first four slots are sample 0 of subspan 0; the next four
1324        * are sample 1 of subspan 0; the third group is sample 0 of
1325        * subspan 1, and finally sample 1 of subspan 1.
1326        */
1327
1328       /* SKL+ has an extra bit for the Starting Sample Pair Index to
1329        * accomodate 16x MSAA.
1330        */
1331       unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
1332
1333       abld.exec_all().group(1, 0)
1334           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1335                fs_reg(sspi_mask));
1336       abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
1337
1338       /* This works for both SIMD8 and SIMD16 */
1339       abld.exec_all().group(4, 0)
1340           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1341
1342       /* This special instruction takes care of setting vstride=1,
1343        * width=4, hstride=0 of t2 during an ADD instruction.
1344        */
1345       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1346    } else {
1347       /* As per GL_ARB_sample_shading specification:
1348        * "When rendering to a non-multisample buffer, or if multisample
1349        *  rasterization is disabled, gl_SampleID will always be zero."
1350        */
1351       abld.MOV(*reg, fs_reg(0));
1352    }
1353
1354    return reg;
1355 }
1356
1357 fs_reg
1358 fs_visitor::resolve_source_modifiers(const fs_reg &src)
1359 {
1360    if (!src.abs && !src.negate)
1361       return src;
1362
1363    fs_reg temp = bld.vgrf(src.type);
1364    bld.MOV(temp, src);
1365
1366    return temp;
1367 }
1368
1369 void
1370 fs_visitor::emit_discard_jump()
1371 {
1372    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1373
1374    /* For performance, after a discard, jump to the end of the
1375     * shader if all relevant channels have been discarded.
1376     */
1377    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1378    discard_jump->flag_subreg = 1;
1379
1380    discard_jump->predicate = (dispatch_width == 8)
1381                              ? BRW_PREDICATE_ALIGN1_ANY8H
1382                              : BRW_PREDICATE_ALIGN1_ANY16H;
1383    discard_jump->predicate_inverse = true;
1384 }
1385
1386 void
1387 fs_visitor::emit_gs_thread_end()
1388 {
1389    assert(stage == MESA_SHADER_GEOMETRY);
1390
1391    struct brw_gs_prog_data *gs_prog_data =
1392       (struct brw_gs_prog_data *) prog_data;
1393
1394    if (gs_compile->control_data_header_size_bits > 0) {
1395       emit_gs_control_data_bits(this->final_gs_vertex_count);
1396    }
1397
1398    const fs_builder abld = bld.annotate("thread end");
1399    fs_inst *inst;
1400
1401    if (gs_prog_data->static_vertex_count != -1) {
1402       foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1403          if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1404              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1405              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1406              prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1407             prev->eot = true;
1408
1409             /* Delete now dead instructions. */
1410             foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1411                if (dead == prev)
1412                   break;
1413                dead->remove();
1414             }
1415             return;
1416          } else if (prev->is_control_flow() || prev->has_side_effects()) {
1417             break;
1418          }
1419       }
1420       fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1421       abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1422       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1423       inst->mlen = 1;
1424    } else {
1425       fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1426       fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1427       sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1428       sources[1] = this->final_gs_vertex_count;
1429       abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1430       inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1431       inst->mlen = 2;
1432    }
1433    inst->eot = true;
1434    inst->offset = 0;
1435 }
1436
1437 void
1438 fs_visitor::assign_curb_setup()
1439 {
1440    if (dispatch_width == 8) {
1441       prog_data->dispatch_grf_start_reg = payload.num_regs;
1442    } else {
1443       if (stage == MESA_SHADER_FRAGMENT) {
1444          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1445          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1446       } else if (stage == MESA_SHADER_COMPUTE) {
1447          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1448          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1449       } else {
1450          unreachable("Unsupported shader type!");
1451       }
1452    }
1453
1454    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1455
1456    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1457    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1458       for (unsigned int i = 0; i < inst->sources; i++) {
1459          if (inst->src[i].file == UNIFORM) {
1460             int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
1461             int constant_nr;
1462             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1463                constant_nr = push_constant_loc[uniform_nr];
1464             } else {
1465                /* Section 5.11 of the OpenGL 4.1 spec says:
1466                 * "Out-of-bounds reads return undefined values, which include
1467                 *  values from other variables of the active program or zero."
1468                 * Just return the first push constant.
1469                 */
1470                constant_nr = 0;
1471             }
1472
1473             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1474                                                   constant_nr / 8,
1475                                                   constant_nr % 8);
1476             brw_reg.abs = inst->src[i].abs;
1477             brw_reg.negate = inst->src[i].negate;
1478
1479             assert(inst->src[i].stride == 0);
1480             inst->src[i] = byte_offset(
1481                retype(brw_reg, inst->src[i].type),
1482                inst->src[i].subreg_offset);
1483          }
1484       }
1485    }
1486
1487    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1488    this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1489 }
1490
1491 void
1492 fs_visitor::calculate_urb_setup()
1493 {
1494    assert(stage == MESA_SHADER_FRAGMENT);
1495    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1496    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1497
1498    memset(prog_data->urb_setup, -1,
1499           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1500
1501    int urb_next = 0;
1502    /* Figure out where each of the incoming setup attributes lands. */
1503    if (devinfo->gen >= 6) {
1504       if (_mesa_bitcount_64(nir->info.inputs_read &
1505                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1506          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1507           * first 16 varying inputs, so we can put them wherever we want.
1508           * Just put them in order.
1509           *
1510           * This is useful because it means that (a) inputs not used by the
1511           * fragment shader won't take up valuable register space, and (b) we
1512           * won't have to recompile the fragment shader if it gets paired with
1513           * a different vertex (or geometry) shader.
1514           */
1515          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1516             if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1517                 BITFIELD64_BIT(i)) {
1518                prog_data->urb_setup[i] = urb_next++;
1519             }
1520          }
1521       } else {
1522          bool include_vue_header =
1523             nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
1524
1525          /* We have enough input varyings that the SF/SBE pipeline stage can't
1526           * arbitrarily rearrange them to suit our whim; we have to put them
1527           * in an order that matches the output of the previous pipeline stage
1528           * (geometry or vertex shader).
1529           */
1530          struct brw_vue_map prev_stage_vue_map;
1531          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1532                              key->input_slots_valid,
1533                              nir->info.separate_shader);
1534          int first_slot =
1535             include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1536
1537          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1538          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1539               slot++) {
1540             int varying = prev_stage_vue_map.slot_to_varying[slot];
1541             if (varying != BRW_VARYING_SLOT_PAD &&
1542                 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1543                  BITFIELD64_BIT(varying))) {
1544                prog_data->urb_setup[varying] = slot - first_slot;
1545             }
1546          }
1547          urb_next = prev_stage_vue_map.num_slots - first_slot;
1548       }
1549    } else {
1550       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1551       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1552          /* Point size is packed into the header, not as a general attribute */
1553          if (i == VARYING_SLOT_PSIZ)
1554             continue;
1555
1556          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1557             /* The back color slot is skipped when the front color is
1558              * also written to.  In addition, some slots can be
1559              * written in the vertex shader and not read in the
1560              * fragment shader.  So the register number must always be
1561              * incremented, mapped or not.
1562              */
1563             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1564                prog_data->urb_setup[i] = urb_next;
1565             urb_next++;
1566          }
1567       }
1568
1569       /*
1570        * It's a FS only attribute, and we did interpolation for this attribute
1571        * in SF thread. So, count it here, too.
1572        *
1573        * See compile_sf_prog() for more info.
1574        */
1575       if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1576          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1577    }
1578
1579    prog_data->num_varying_inputs = urb_next;
1580 }
1581
1582 void
1583 fs_visitor::assign_urb_setup()
1584 {
1585    assert(stage == MESA_SHADER_FRAGMENT);
1586    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1587
1588    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1589
1590    /* Offset all the urb_setup[] index by the actual position of the
1591     * setup regs, now that the location of the constants has been chosen.
1592     */
1593    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1594       if (inst->opcode == FS_OPCODE_LINTERP) {
1595          assert(inst->src[1].file == FIXED_GRF);
1596          inst->src[1].nr += urb_start;
1597       }
1598
1599       if (inst->opcode == FS_OPCODE_CINTERP) {
1600          assert(inst->src[0].file == FIXED_GRF);
1601          inst->src[0].nr += urb_start;
1602       }
1603    }
1604
1605    /* Each attribute is 4 setup channels, each of which is half a reg. */
1606    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1607 }
1608
1609 void
1610 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1611 {
1612    for (int i = 0; i < inst->sources; i++) {
1613       if (inst->src[i].file == ATTR) {
1614          int grf = payload.num_regs +
1615                    prog_data->curb_read_length +
1616                    inst->src[i].nr +
1617                    inst->src[i].reg_offset;
1618
1619          unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
1620          struct brw_reg reg =
1621             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1622                                inst->src[i].subreg_offset),
1623                    inst->exec_size * inst->src[i].stride,
1624                    width, inst->src[i].stride);
1625          reg.abs = inst->src[i].abs;
1626          reg.negate = inst->src[i].negate;
1627
1628          inst->src[i] = reg;
1629       }
1630    }
1631 }
1632
1633 void
1634 fs_visitor::assign_vs_urb_setup()
1635 {
1636    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1637
1638    assert(stage == MESA_SHADER_VERTEX);
1639    int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1640    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1641       count++;
1642
1643    /* Each attribute is 4 regs. */
1644    this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
1645
1646    assert(vs_prog_data->base.urb_read_length <= 15);
1647
1648    /* Rewrite all ATTR file references to the hw grf that they land in. */
1649    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1650       convert_attr_sources_to_hw_regs(inst);
1651    }
1652 }
1653
1654 void
1655 fs_visitor::assign_gs_urb_setup()
1656 {
1657    assert(stage == MESA_SHADER_GEOMETRY);
1658
1659    brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
1660
1661    first_non_payload_grf +=
1662       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1663
1664    const unsigned first_icp_handle = payload.num_regs -
1665       (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
1666
1667    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1668       /* Lower URB_READ_SIMD8 opcodes into real messages. */
1669       if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
1670          assert(inst->src[0].file == IMM);
1671          inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
1672                                             inst->src[0].ud,
1673                                             0), BRW_REGISTER_TYPE_UD);
1674          /* for now, assume constant - we can do per-slot offsets later */
1675          assert(inst->src[1].file == IMM);
1676          inst->offset = inst->src[1].ud;
1677          inst->src[1] = fs_reg();
1678          inst->mlen = 1;
1679          inst->base_mrf = -1;
1680       }
1681
1682       /* Rewrite all ATTR file references to GRFs. */
1683       convert_attr_sources_to_hw_regs(inst);
1684    }
1685 }
1686
1687
1688 /**
1689  * Split large virtual GRFs into separate components if we can.
1690  *
1691  * This is mostly duplicated with what brw_fs_vector_splitting does,
1692  * but that's really conservative because it's afraid of doing
1693  * splitting that doesn't result in real progress after the rest of
1694  * the optimization phases, which would cause infinite looping in
1695  * optimization.  We can do it once here, safely.  This also has the
1696  * opportunity to split interpolated values, or maybe even uniforms,
1697  * which we don't have at the IR level.
1698  *
1699  * We want to split, because virtual GRFs are what we register
1700  * allocate and spill (due to contiguousness requirements for some
1701  * instructions), and they're what we naturally generate in the
1702  * codegen process, but most virtual GRFs don't actually need to be
1703  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1704  * live intervals and better dead code elimination and coalescing.
1705  */
1706 void
1707 fs_visitor::split_virtual_grfs()
1708 {
1709    int num_vars = this->alloc.count;
1710
1711    /* Count the total number of registers */
1712    int reg_count = 0;
1713    int vgrf_to_reg[num_vars];
1714    for (int i = 0; i < num_vars; i++) {
1715       vgrf_to_reg[i] = reg_count;
1716       reg_count += alloc.sizes[i];
1717    }
1718
1719    /* An array of "split points".  For each register slot, this indicates
1720     * if this slot can be separated from the previous slot.  Every time an
1721     * instruction uses multiple elements of a register (as a source or
1722     * destination), we mark the used slots as inseparable.  Then we go
1723     * through and split the registers into the smallest pieces we can.
1724     */
1725    bool split_points[reg_count];
1726    memset(split_points, 0, sizeof(split_points));
1727
1728    /* Mark all used registers as fully splittable */
1729    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1730       if (inst->dst.file == VGRF) {
1731          int reg = vgrf_to_reg[inst->dst.nr];
1732          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1733             split_points[reg + j] = true;
1734       }
1735
1736       for (int i = 0; i < inst->sources; i++) {
1737          if (inst->src[i].file == VGRF) {
1738             int reg = vgrf_to_reg[inst->src[i].nr];
1739             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1740                split_points[reg + j] = true;
1741          }
1742       }
1743    }
1744
1745    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1746       if (inst->dst.file == VGRF) {
1747          int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1748          for (int j = 1; j < inst->regs_written; j++)
1749             split_points[reg + j] = false;
1750       }
1751       for (int i = 0; i < inst->sources; i++) {
1752          if (inst->src[i].file == VGRF) {
1753             int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1754             for (int j = 1; j < inst->regs_read(i); j++)
1755                split_points[reg + j] = false;
1756          }
1757       }
1758    }
1759
1760    int new_virtual_grf[reg_count];
1761    int new_reg_offset[reg_count];
1762
1763    int reg = 0;
1764    for (int i = 0; i < num_vars; i++) {
1765       /* The first one should always be 0 as a quick sanity check. */
1766       assert(split_points[reg] == false);
1767
1768       /* j = 0 case */
1769       new_reg_offset[reg] = 0;
1770       reg++;
1771       int offset = 1;
1772
1773       /* j > 0 case */
1774       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1775          /* If this is a split point, reset the offset to 0 and allocate a
1776           * new virtual GRF for the previous offset many registers
1777           */
1778          if (split_points[reg]) {
1779             assert(offset <= MAX_VGRF_SIZE);
1780             int grf = alloc.allocate(offset);
1781             for (int k = reg - offset; k < reg; k++)
1782                new_virtual_grf[k] = grf;
1783             offset = 0;
1784          }
1785          new_reg_offset[reg] = offset;
1786          offset++;
1787          reg++;
1788       }
1789
1790       /* The last one gets the original register number */
1791       assert(offset <= MAX_VGRF_SIZE);
1792       alloc.sizes[i] = offset;
1793       for (int k = reg - offset; k < reg; k++)
1794          new_virtual_grf[k] = i;
1795    }
1796    assert(reg == reg_count);
1797
1798    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1799       if (inst->dst.file == VGRF) {
1800          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1801          inst->dst.nr = new_virtual_grf[reg];
1802          inst->dst.reg_offset = new_reg_offset[reg];
1803          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1804       }
1805       for (int i = 0; i < inst->sources; i++) {
1806          if (inst->src[i].file == VGRF) {
1807             reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1808             inst->src[i].nr = new_virtual_grf[reg];
1809             inst->src[i].reg_offset = new_reg_offset[reg];
1810             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1811          }
1812       }
1813    }
1814    invalidate_live_intervals();
1815 }
1816
1817 /**
1818  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1819  *
1820  * During code generation, we create tons of temporary variables, many of
1821  * which get immediately killed and are never used again.  Yet, in later
1822  * optimization and analysis passes, such as compute_live_intervals, we need
1823  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1824  * overhead.
1825  */
1826 bool
1827 fs_visitor::compact_virtual_grfs()
1828 {
1829    bool progress = false;
1830    int remap_table[this->alloc.count];
1831    memset(remap_table, -1, sizeof(remap_table));
1832
1833    /* Mark which virtual GRFs are used. */
1834    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1835       if (inst->dst.file == VGRF)
1836          remap_table[inst->dst.nr] = 0;
1837
1838       for (int i = 0; i < inst->sources; i++) {
1839          if (inst->src[i].file == VGRF)
1840             remap_table[inst->src[i].nr] = 0;
1841       }
1842    }
1843
1844    /* Compact the GRF arrays. */
1845    int new_index = 0;
1846    for (unsigned i = 0; i < this->alloc.count; i++) {
1847       if (remap_table[i] == -1) {
1848          /* We just found an unused register.  This means that we are
1849           * actually going to compact something.
1850           */
1851          progress = true;
1852       } else {
1853          remap_table[i] = new_index;
1854          alloc.sizes[new_index] = alloc.sizes[i];
1855          invalidate_live_intervals();
1856          ++new_index;
1857       }
1858    }
1859
1860    this->alloc.count = new_index;
1861
1862    /* Patch all the instructions to use the newly renumbered registers */
1863    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1864       if (inst->dst.file == VGRF)
1865          inst->dst.nr = remap_table[inst->dst.nr];
1866
1867       for (int i = 0; i < inst->sources; i++) {
1868          if (inst->src[i].file == VGRF)
1869             inst->src[i].nr = remap_table[inst->src[i].nr];
1870       }
1871    }
1872
1873    /* Patch all the references to delta_xy, since they're used in register
1874     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1875     * think some random VGRF is delta_xy.
1876     */
1877    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1878       if (delta_xy[i].file == VGRF) {
1879          if (remap_table[delta_xy[i].nr] != -1) {
1880             delta_xy[i].nr = remap_table[delta_xy[i].nr];
1881          } else {
1882             delta_xy[i].file = BAD_FILE;
1883          }
1884       }
1885    }
1886
1887    return progress;
1888 }
1889
1890 /**
1891  * Assign UNIFORM file registers to either push constants or pull constants.
1892  *
1893  * We allow a fragment shader to have more than the specified minimum
1894  * maximum number of fragment shader uniform components (64).  If
1895  * there are too many of these, they'd fill up all of register space.
1896  * So, this will push some of them out to the pull constant buffer and
1897  * update the program to load them.  We also use pull constants for all
1898  * indirect constant loads because we don't support indirect accesses in
1899  * registers yet.
1900  */
1901 void
1902 fs_visitor::assign_constant_locations()
1903 {
1904    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1905    if (dispatch_width != 8)
1906       return;
1907
1908    unsigned int num_pull_constants = 0;
1909
1910    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1911    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1912
1913    bool is_live[uniforms];
1914    memset(is_live, 0, sizeof(is_live));
1915
1916    /* First, we walk through the instructions and do two things:
1917     *
1918     *  1) Figure out which uniforms are live.
1919     *
1920     *  2) Find all indirect access of uniform arrays and flag them as needing
1921     *     to go into the pull constant buffer.
1922     *
1923     * Note that we don't move constant-indexed accesses to arrays.  No
1924     * testing has been done of the performance impact of this choice.
1925     */
1926    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1927       for (int i = 0 ; i < inst->sources; i++) {
1928          if (inst->src[i].file != UNIFORM)
1929             continue;
1930
1931          if (inst->src[i].reladdr) {
1932             int uniform = inst->src[i].nr;
1933
1934             /* If this array isn't already present in the pull constant buffer,
1935              * add it.
1936              */
1937             if (pull_constant_loc[uniform] == -1) {
1938                assert(param_size[uniform]);
1939                for (int j = 0; j < param_size[uniform]; j++)
1940                   pull_constant_loc[uniform + j] = num_pull_constants++;
1941             }
1942          } else {
1943             /* Mark the the one accessed uniform as live */
1944             int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
1945             if (constant_nr >= 0 && constant_nr < (int) uniforms)
1946                is_live[constant_nr] = true;
1947          }
1948       }
1949    }
1950
1951    /* Only allow 16 registers (128 uniform components) as push constants.
1952     *
1953     * Just demote the end of the list.  We could probably do better
1954     * here, demoting things that are rarely used in the program first.
1955     *
1956     * If changing this value, note the limitation about total_regs in
1957     * brw_curbe.c.
1958     */
1959    unsigned int max_push_components = 16 * 8;
1960    unsigned int num_push_constants = 0;
1961
1962    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1963
1964    for (unsigned int i = 0; i < uniforms; i++) {
1965       if (!is_live[i] || pull_constant_loc[i] != -1) {
1966          /* This UNIFORM register is either dead, or has already been demoted
1967           * to a pull const.  Mark it as no longer living in the param[] array.
1968           */
1969          push_constant_loc[i] = -1;
1970          continue;
1971       }
1972
1973       if (num_push_constants < max_push_components) {
1974          /* Retain as a push constant.  Record the location in the params[]
1975           * array.
1976           */
1977          push_constant_loc[i] = num_push_constants++;
1978       } else {
1979          /* Demote to a pull constant. */
1980          push_constant_loc[i] = -1;
1981          pull_constant_loc[i] = num_pull_constants++;
1982       }
1983    }
1984
1985    stage_prog_data->nr_params = num_push_constants;
1986    stage_prog_data->nr_pull_params = num_pull_constants;
1987
1988    /* Up until now, the param[] array has been indexed by reg + reg_offset
1989     * of UNIFORM registers.  Move pull constants into pull_param[] and
1990     * condense param[] to only contain the uniforms we chose to push.
1991     *
1992     * NOTE: Because we are condensing the params[] array, we know that
1993     * push_constant_loc[i] <= i and we can do it in one smooth loop without
1994     * having to make a copy.
1995     */
1996    for (unsigned int i = 0; i < uniforms; i++) {
1997       const gl_constant_value *value = stage_prog_data->param[i];
1998
1999       if (pull_constant_loc[i] != -1) {
2000          stage_prog_data->pull_param[pull_constant_loc[i]] = value;
2001       } else if (push_constant_loc[i] != -1) {
2002          stage_prog_data->param[push_constant_loc[i]] = value;
2003       }
2004    }
2005 }
2006
2007 /**
2008  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2009  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2010  */
2011 void
2012 fs_visitor::demote_pull_constants()
2013 {
2014    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2015       for (int i = 0; i < inst->sources; i++) {
2016          if (inst->src[i].file != UNIFORM)
2017             continue;
2018
2019          int pull_index;
2020          unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
2021          if (location >= uniforms) /* Out of bounds access */
2022             pull_index = -1;
2023          else
2024             pull_index = pull_constant_loc[location];
2025
2026          if (pull_index == -1)
2027             continue;
2028
2029          /* Set up the annotation tracking for new generated instructions. */
2030          const fs_builder ibld(this, block, inst);
2031          const unsigned index = stage_prog_data->binding_table.pull_constants_start;
2032          fs_reg dst = vgrf(glsl_type::float_type);
2033
2034          assert(inst->src[i].stride == 0);
2035
2036          /* Generate a pull load into dst. */
2037          if (inst->src[i].reladdr) {
2038             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
2039                                        fs_reg(index),
2040                                        *inst->src[i].reladdr,
2041                                        pull_index);
2042             inst->src[i].reladdr = NULL;
2043             inst->src[i].stride = 1;
2044          } else {
2045             const fs_builder ubld = ibld.exec_all().group(8, 0);
2046             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2047             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2048                       dst, fs_reg(index), offset);
2049             inst->src[i].set_smear(pull_index & 3);
2050          }
2051          brw_mark_surface_used(prog_data, index);
2052
2053          /* Rewrite the instruction to use the temporary VGRF. */
2054          inst->src[i].file = VGRF;
2055          inst->src[i].nr = dst.nr;
2056          inst->src[i].reg_offset = 0;
2057       }
2058    }
2059    invalidate_live_intervals();
2060 }
2061
2062 bool
2063 fs_visitor::opt_algebraic()
2064 {
2065    bool progress = false;
2066
2067    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068       switch (inst->opcode) {
2069       case BRW_OPCODE_MOV:
2070          if (inst->src[0].file != IMM)
2071             break;
2072
2073          if (inst->saturate) {
2074             if (inst->dst.type != inst->src[0].type)
2075                assert(!"unimplemented: saturate mixed types");
2076
2077             if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
2078                inst->saturate = false;
2079                progress = true;
2080             }
2081          }
2082          break;
2083
2084       case BRW_OPCODE_MUL:
2085          if (inst->src[1].file != IMM)
2086             continue;
2087
2088          /* a * 1.0 = a */
2089          if (inst->src[1].is_one()) {
2090             inst->opcode = BRW_OPCODE_MOV;
2091             inst->src[1] = reg_undef;
2092             progress = true;
2093             break;
2094          }
2095
2096          /* a * -1.0 = -a */
2097          if (inst->src[1].is_negative_one()) {
2098             inst->opcode = BRW_OPCODE_MOV;
2099             inst->src[0].negate = !inst->src[0].negate;
2100             inst->src[1] = reg_undef;
2101             progress = true;
2102             break;
2103          }
2104
2105          /* a * 0.0 = 0.0 */
2106          if (inst->src[1].is_zero()) {
2107             inst->opcode = BRW_OPCODE_MOV;
2108             inst->src[0] = inst->src[1];
2109             inst->src[1] = reg_undef;
2110             progress = true;
2111             break;
2112          }
2113
2114          if (inst->src[0].file == IMM) {
2115             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2116             inst->opcode = BRW_OPCODE_MOV;
2117             inst->src[0].f *= inst->src[1].f;
2118             inst->src[1] = reg_undef;
2119             progress = true;
2120             break;
2121          }
2122          break;
2123       case BRW_OPCODE_ADD:
2124          if (inst->src[1].file != IMM)
2125             continue;
2126
2127          /* a + 0.0 = a */
2128          if (inst->src[1].is_zero()) {
2129             inst->opcode = BRW_OPCODE_MOV;
2130             inst->src[1] = reg_undef;
2131             progress = true;
2132             break;
2133          }
2134
2135          if (inst->src[0].file == IMM) {
2136             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2137             inst->opcode = BRW_OPCODE_MOV;
2138             inst->src[0].f += inst->src[1].f;
2139             inst->src[1] = reg_undef;
2140             progress = true;
2141             break;
2142          }
2143          break;
2144       case BRW_OPCODE_OR:
2145          if (inst->src[0].equals(inst->src[1])) {
2146             inst->opcode = BRW_OPCODE_MOV;
2147             inst->src[1] = reg_undef;
2148             progress = true;
2149             break;
2150          }
2151          break;
2152       case BRW_OPCODE_LRP:
2153          if (inst->src[1].equals(inst->src[2])) {
2154             inst->opcode = BRW_OPCODE_MOV;
2155             inst->src[0] = inst->src[1];
2156             inst->src[1] = reg_undef;
2157             inst->src[2] = reg_undef;
2158             progress = true;
2159             break;
2160          }
2161          break;
2162       case BRW_OPCODE_CMP:
2163          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2164              inst->src[0].abs &&
2165              inst->src[0].negate &&
2166              inst->src[1].is_zero()) {
2167             inst->src[0].abs = false;
2168             inst->src[0].negate = false;
2169             inst->conditional_mod = BRW_CONDITIONAL_Z;
2170             progress = true;
2171             break;
2172          }
2173          break;
2174       case BRW_OPCODE_SEL:
2175          if (inst->src[0].equals(inst->src[1])) {
2176             inst->opcode = BRW_OPCODE_MOV;
2177             inst->src[1] = reg_undef;
2178             inst->predicate = BRW_PREDICATE_NONE;
2179             inst->predicate_inverse = false;
2180             progress = true;
2181          } else if (inst->saturate && inst->src[1].file == IMM) {
2182             switch (inst->conditional_mod) {
2183             case BRW_CONDITIONAL_LE:
2184             case BRW_CONDITIONAL_L:
2185                switch (inst->src[1].type) {
2186                case BRW_REGISTER_TYPE_F:
2187                   if (inst->src[1].f >= 1.0f) {
2188                      inst->opcode = BRW_OPCODE_MOV;
2189                      inst->src[1] = reg_undef;
2190                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2191                      progress = true;
2192                   }
2193                   break;
2194                default:
2195                   break;
2196                }
2197                break;
2198             case BRW_CONDITIONAL_GE:
2199             case BRW_CONDITIONAL_G:
2200                switch (inst->src[1].type) {
2201                case BRW_REGISTER_TYPE_F:
2202                   if (inst->src[1].f <= 0.0f) {
2203                      inst->opcode = BRW_OPCODE_MOV;
2204                      inst->src[1] = reg_undef;
2205                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2206                      progress = true;
2207                   }
2208                   break;
2209                default:
2210                   break;
2211                }
2212             default:
2213                break;
2214             }
2215          }
2216          break;
2217       case BRW_OPCODE_MAD:
2218          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2219             inst->opcode = BRW_OPCODE_MOV;
2220             inst->src[1] = reg_undef;
2221             inst->src[2] = reg_undef;
2222             progress = true;
2223          } else if (inst->src[0].is_zero()) {
2224             inst->opcode = BRW_OPCODE_MUL;
2225             inst->src[0] = inst->src[2];
2226             inst->src[2] = reg_undef;
2227             progress = true;
2228          } else if (inst->src[1].is_one()) {
2229             inst->opcode = BRW_OPCODE_ADD;
2230             inst->src[1] = inst->src[2];
2231             inst->src[2] = reg_undef;
2232             progress = true;
2233          } else if (inst->src[2].is_one()) {
2234             inst->opcode = BRW_OPCODE_ADD;
2235             inst->src[2] = reg_undef;
2236             progress = true;
2237          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2238             inst->opcode = BRW_OPCODE_ADD;
2239             inst->src[1].f *= inst->src[2].f;
2240             inst->src[2] = reg_undef;
2241             progress = true;
2242          }
2243          break;
2244       case SHADER_OPCODE_RCP: {
2245          fs_inst *prev = (fs_inst *)inst->prev;
2246          if (prev->opcode == SHADER_OPCODE_SQRT) {
2247             if (inst->src[0].equals(prev->dst)) {
2248                inst->opcode = SHADER_OPCODE_RSQ;
2249                inst->src[0] = prev->src[0];
2250                progress = true;
2251             }
2252          }
2253          break;
2254       }
2255       case SHADER_OPCODE_BROADCAST:
2256          if (is_uniform(inst->src[0])) {
2257             inst->opcode = BRW_OPCODE_MOV;
2258             inst->sources = 1;
2259             inst->force_writemask_all = true;
2260             progress = true;
2261          } else if (inst->src[1].file == IMM) {
2262             inst->opcode = BRW_OPCODE_MOV;
2263             inst->src[0] = component(inst->src[0],
2264                                      inst->src[1].ud);
2265             inst->sources = 1;
2266             inst->force_writemask_all = true;
2267             progress = true;
2268          }
2269          break;
2270
2271       default:
2272          break;
2273       }
2274
2275       /* Swap if src[0] is immediate. */
2276       if (progress && inst->is_commutative()) {
2277          if (inst->src[0].file == IMM) {
2278             fs_reg tmp = inst->src[1];
2279             inst->src[1] = inst->src[0];
2280             inst->src[0] = tmp;
2281          }
2282       }
2283    }
2284    return progress;
2285 }
2286
2287 /**
2288  * Optimize sample messages that have constant zero values for the trailing
2289  * texture coordinates. We can just reduce the message length for these
2290  * instructions instead of reserving a register for it. Trailing parameters
2291  * that aren't sent default to zero anyway. This will cause the dead code
2292  * eliminator to remove the MOV instruction that would otherwise be emitted to
2293  * set up the zero value.
2294  */
2295 bool
2296 fs_visitor::opt_zero_samples()
2297 {
2298    /* Gen4 infers the texturing opcode based on the message length so we can't
2299     * change it.
2300     */
2301    if (devinfo->gen < 5)
2302       return false;
2303
2304    bool progress = false;
2305
2306    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2307       if (!inst->is_tex())
2308          continue;
2309
2310       fs_inst *load_payload = (fs_inst *) inst->prev;
2311
2312       if (load_payload->is_head_sentinel() ||
2313           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2314          continue;
2315
2316       /* We don't want to remove the message header or the first parameter.
2317        * Removing the first parameter is not allowed, see the Haswell PRM
2318        * volume 7, page 149:
2319        *
2320        *     "Parameter 0 is required except for the sampleinfo message, which
2321        *      has no parameter 0"
2322        */
2323       while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2324              load_payload->src[(inst->mlen - inst->header_size) /
2325                                (inst->exec_size / 8) +
2326                                inst->header_size - 1].is_zero()) {
2327          inst->mlen -= inst->exec_size / 8;
2328          progress = true;
2329       }
2330    }
2331
2332    if (progress)
2333       invalidate_live_intervals();
2334
2335    return progress;
2336 }
2337
2338 /**
2339  * Optimize sample messages which are followed by the final RT write.
2340  *
2341  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2342  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2343  * final texturing results copied to the framebuffer write payload and modify
2344  * them to write to the framebuffer directly.
2345  */
2346 bool
2347 fs_visitor::opt_sampler_eot()
2348 {
2349    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2350
2351    if (stage != MESA_SHADER_FRAGMENT)
2352       return false;
2353
2354    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2355       return false;
2356
2357    /* FINISHME: It should be possible to implement this optimization when there
2358     * are multiple drawbuffers.
2359     */
2360    if (key->nr_color_regions != 1)
2361       return false;
2362
2363    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2364    bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
2365    fs_inst *fb_write = (fs_inst *)block->end();
2366    assert(fb_write->eot);
2367    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2368
2369    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2370
2371    /* There wasn't one; nothing to do. */
2372    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2373       return false;
2374
2375    /* 3D Sampler » Messages » Message Format
2376     *
2377     * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
2378     *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
2379     */
2380    if (tex_inst->opcode == SHADER_OPCODE_TXS ||
2381        tex_inst->opcode == SHADER_OPCODE_SAMPLEINFO ||
2382        tex_inst->opcode == SHADER_OPCODE_LOD ||
2383        tex_inst->opcode == SHADER_OPCODE_TG4 ||
2384        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2385       return false;
2386
2387    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2388     * It's very likely to be the previous instruction.
2389     */
2390    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2391    if (load_payload->is_head_sentinel() ||
2392        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2393       return false;
2394
2395    assert(!tex_inst->eot); /* We can't get here twice */
2396    assert((tex_inst->offset & (0xff << 24)) == 0);
2397
2398    const fs_builder ibld(this, block, tex_inst);
2399
2400    tex_inst->offset |= fb_write->target << 24;
2401    tex_inst->eot = true;
2402    tex_inst->dst = ibld.null_reg_ud();
2403    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2404
2405    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2406     * to create a new LOAD_PAYLOAD command with the same sources and a space
2407     * saved for the header. Using a new destination register not only makes sure
2408     * we have enough space, but it will make sure the dead code eliminator kills
2409     * the instruction that this will replace.
2410     */
2411    if (tex_inst->header_size != 0)
2412       return true;
2413
2414    fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
2415                                   load_payload->sources + 1);
2416    fs_reg *new_sources =
2417       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2418
2419    new_sources[0] = fs_reg();
2420    for (int i = 0; i < load_payload->sources; i++)
2421       new_sources[i+1] = load_payload->src[i];
2422
2423    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2424     * requires a lot of information about the sources to appropriately figure
2425     * out the number of registers needed to be used. Given this stage in our
2426     * optimization, we may not have the appropriate GRFs required by
2427     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2428     * manually emit the instruction.
2429     */
2430    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2431                                                     load_payload->exec_size,
2432                                                     send_header,
2433                                                     new_sources,
2434                                                     load_payload->sources + 1);
2435
2436    new_load_payload->regs_written = load_payload->regs_written + 1;
2437    new_load_payload->header_size = 1;
2438    tex_inst->mlen++;
2439    tex_inst->header_size = 1;
2440    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2441    tex_inst->src[0] = send_header;
2442
2443    return true;
2444 }
2445
2446 bool
2447 fs_visitor::opt_register_renaming()
2448 {
2449    bool progress = false;
2450    int depth = 0;
2451
2452    int remap[alloc.count];
2453    memset(remap, -1, sizeof(int) * alloc.count);
2454
2455    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2456       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2457          depth++;
2458       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2459                  inst->opcode == BRW_OPCODE_WHILE) {
2460          depth--;
2461       }
2462
2463       /* Rewrite instruction sources. */
2464       for (int i = 0; i < inst->sources; i++) {
2465          if (inst->src[i].file == VGRF &&
2466              remap[inst->src[i].nr] != -1 &&
2467              remap[inst->src[i].nr] != inst->src[i].nr) {
2468             inst->src[i].nr = remap[inst->src[i].nr];
2469             progress = true;
2470          }
2471       }
2472
2473       const int dst = inst->dst.nr;
2474
2475       if (depth == 0 &&
2476           inst->dst.file == VGRF &&
2477           alloc.sizes[inst->dst.nr] == inst->exec_size / 8 &&
2478           !inst->is_partial_write()) {
2479          if (remap[dst] == -1) {
2480             remap[dst] = dst;
2481          } else {
2482             remap[dst] = alloc.allocate(inst->exec_size / 8);
2483             inst->dst.nr = remap[dst];
2484             progress = true;
2485          }
2486       } else if (inst->dst.file == VGRF &&
2487                  remap[dst] != -1 &&
2488                  remap[dst] != dst) {
2489          inst->dst.nr = remap[dst];
2490          progress = true;
2491       }
2492    }
2493
2494    if (progress) {
2495       invalidate_live_intervals();
2496
2497       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2498          if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
2499             delta_xy[i].nr = remap[delta_xy[i].nr];
2500          }
2501       }
2502    }
2503
2504    return progress;
2505 }
2506
2507 /**
2508  * Remove redundant or useless discard jumps.
2509  *
2510  * For example, we can eliminate jumps in the following sequence:
2511  *
2512  * discard-jump       (redundant with the next jump)
2513  * discard-jump       (useless; jumps to the next instruction)
2514  * placeholder-halt
2515  */
2516 bool
2517 fs_visitor::opt_redundant_discard_jumps()
2518 {
2519    bool progress = false;
2520
2521    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2522
2523    fs_inst *placeholder_halt = NULL;
2524    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2525       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2526          placeholder_halt = inst;
2527          break;
2528       }
2529    }
2530
2531    if (!placeholder_halt)
2532       return false;
2533
2534    /* Delete any HALTs immediately before the placeholder halt. */
2535    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2536         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2537         prev = (fs_inst *) placeholder_halt->prev) {
2538       prev->remove(last_bblock);
2539       progress = true;
2540    }
2541
2542    if (progress)
2543       invalidate_live_intervals();
2544
2545    return progress;
2546 }
2547
2548 bool
2549 fs_visitor::compute_to_mrf()
2550 {
2551    bool progress = false;
2552    int next_ip = 0;
2553
2554    /* No MRFs on Gen >= 7. */
2555    if (devinfo->gen >= 7)
2556       return false;
2557
2558    calculate_live_intervals();
2559
2560    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2561       int ip = next_ip;
2562       next_ip++;
2563
2564       if (inst->opcode != BRW_OPCODE_MOV ||
2565           inst->is_partial_write() ||
2566           inst->dst.file != MRF || inst->src[0].file != VGRF ||
2567           inst->dst.type != inst->src[0].type ||
2568           inst->src[0].abs || inst->src[0].negate ||
2569           !inst->src[0].is_contiguous() ||
2570           inst->src[0].subreg_offset)
2571          continue;
2572
2573       /* Work out which hardware MRF registers are written by this
2574        * instruction.
2575        */
2576       int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
2577       int mrf_high;
2578       if (inst->dst.nr & BRW_MRF_COMPR4) {
2579          mrf_high = mrf_low + 4;
2580       } else if (inst->exec_size == 16) {
2581          mrf_high = mrf_low + 1;
2582       } else {
2583          mrf_high = mrf_low;
2584       }
2585
2586       /* Can't compute-to-MRF this GRF if someone else was going to
2587        * read it later.
2588        */
2589       if (this->virtual_grf_end[inst->src[0].nr] > ip)
2590          continue;
2591
2592       /* Found a move of a GRF to a MRF.  Let's see if we can go
2593        * rewrite the thing that made this GRF to write into the MRF.
2594        */
2595       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2596          if (scan_inst->dst.file == VGRF &&
2597             scan_inst->dst.nr == inst->src[0].nr) {
2598             /* Found the last thing to write our reg we want to turn
2599              * into a compute-to-MRF.
2600              */
2601
2602             /* If this one instruction didn't populate all the
2603              * channels, bail.  We might be able to rewrite everything
2604              * that writes that reg, but it would require smarter
2605              * tracking to delay the rewriting until complete success.
2606              */
2607             if (scan_inst->is_partial_write())
2608                break;
2609
2610             /* Things returning more than one register would need us to
2611              * understand coalescing out more than one MOV at a time.
2612              */
2613             if (scan_inst->regs_written > scan_inst->exec_size / 8)
2614                break;
2615
2616             /* SEND instructions can't have MRF as a destination. */
2617             if (scan_inst->mlen)
2618                break;
2619
2620             if (devinfo->gen == 6) {
2621                /* gen6 math instructions must have the destination be
2622                 * GRF, so no compute-to-MRF for them.
2623                 */
2624                if (scan_inst->is_math()) {
2625                   break;
2626                }
2627             }
2628
2629             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2630                /* Found the creator of our MRF's source value. */
2631                scan_inst->dst.file = MRF;
2632                scan_inst->dst.nr = inst->dst.nr;
2633                scan_inst->saturate |= inst->saturate;
2634                inst->remove(block);
2635                progress = true;
2636             }
2637             break;
2638          }
2639
2640          /* We don't handle control flow here.  Most computation of
2641           * values that end up in MRFs are shortly before the MRF
2642           * write anyway.
2643           */
2644          if (block->start() == scan_inst)
2645             break;
2646
2647          /* You can't read from an MRF, so if someone else reads our
2648           * MRF's source GRF that we wanted to rewrite, that stops us.
2649           */
2650          bool interfered = false;
2651          for (int i = 0; i < scan_inst->sources; i++) {
2652             if (scan_inst->src[i].file == VGRF &&
2653                 scan_inst->src[i].nr == inst->src[0].nr &&
2654                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2655                interfered = true;
2656             }
2657          }
2658          if (interfered)
2659             break;
2660
2661          if (scan_inst->dst.file == MRF) {
2662             /* If somebody else writes our MRF here, we can't
2663              * compute-to-MRF before that.
2664              */
2665             int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
2666             int scan_mrf_high;
2667
2668             if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
2669                scan_mrf_high = scan_mrf_low + 4;
2670             } else if (scan_inst->exec_size == 16) {
2671                scan_mrf_high = scan_mrf_low + 1;
2672             } else {
2673                scan_mrf_high = scan_mrf_low;
2674             }
2675
2676             if (mrf_low == scan_mrf_low ||
2677                 mrf_low == scan_mrf_high ||
2678                 mrf_high == scan_mrf_low ||
2679                 mrf_high == scan_mrf_high) {
2680                break;
2681             }
2682          }
2683
2684          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2685             /* Found a SEND instruction, which means that there are
2686              * live values in MRFs from base_mrf to base_mrf +
2687              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2688              * above it.
2689              */
2690             if (mrf_low >= scan_inst->base_mrf &&
2691                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2692                break;
2693             }
2694             if (mrf_high >= scan_inst->base_mrf &&
2695                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2696                break;
2697             }
2698          }
2699       }
2700    }
2701
2702    if (progress)
2703       invalidate_live_intervals();
2704
2705    return progress;
2706 }
2707
2708 /**
2709  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2710  * flow.  We could probably do better here with some form of divergence
2711  * analysis.
2712  */
2713 bool
2714 fs_visitor::eliminate_find_live_channel()
2715 {
2716    bool progress = false;
2717    unsigned depth = 0;
2718
2719    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2720       switch (inst->opcode) {
2721       case BRW_OPCODE_IF:
2722       case BRW_OPCODE_DO:
2723          depth++;
2724          break;
2725
2726       case BRW_OPCODE_ENDIF:
2727       case BRW_OPCODE_WHILE:
2728          depth--;
2729          break;
2730
2731       case FS_OPCODE_DISCARD_JUMP:
2732          /* This can potentially make control flow non-uniform until the end
2733           * of the program.
2734           */
2735          return progress;
2736
2737       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2738          if (depth == 0) {
2739             inst->opcode = BRW_OPCODE_MOV;
2740             inst->src[0] = fs_reg(0u);
2741             inst->sources = 1;
2742             inst->force_writemask_all = true;
2743             progress = true;
2744          }
2745          break;
2746
2747       default:
2748          break;
2749       }
2750    }
2751
2752    return progress;
2753 }
2754
2755 /**
2756  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2757  * instructions to FS_OPCODE_REP_FB_WRITE.
2758  */
2759 void
2760 fs_visitor::emit_repclear_shader()
2761 {
2762    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2763    int base_mrf = 1;
2764    int color_mrf = base_mrf + 2;
2765
2766    fs_inst *mov = bld.exec_all().group(4, 0)
2767                      .MOV(brw_message_reg(color_mrf),
2768                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2769
2770    fs_inst *write;
2771    if (key->nr_color_regions == 1) {
2772       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2773       write->saturate = key->clamp_fragment_color;
2774       write->base_mrf = color_mrf;
2775       write->target = 0;
2776       write->header_size = 0;
2777       write->mlen = 1;
2778    } else {
2779       assume(key->nr_color_regions > 0);
2780       for (int i = 0; i < key->nr_color_regions; ++i) {
2781          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2782          write->saturate = key->clamp_fragment_color;
2783          write->base_mrf = base_mrf;
2784          write->target = i;
2785          write->header_size = 2;
2786          write->mlen = 3;
2787       }
2788    }
2789    write->eot = true;
2790
2791    calculate_cfg();
2792
2793    assign_constant_locations();
2794    assign_curb_setup();
2795
2796    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2797    assert(mov->src[0].file == FIXED_GRF);
2798    mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
2799 }
2800
2801 /**
2802  * Walks through basic blocks, looking for repeated MRF writes and
2803  * removing the later ones.
2804  */
2805 bool
2806 fs_visitor::remove_duplicate_mrf_writes()
2807 {
2808    fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
2809    bool progress = false;
2810
2811    /* Need to update the MRF tracking for compressed instructions. */
2812    if (dispatch_width == 16)
2813       return false;
2814
2815    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2816
2817    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2818       if (inst->is_control_flow()) {
2819          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2820       }
2821
2822       if (inst->opcode == BRW_OPCODE_MOV &&
2823           inst->dst.file == MRF) {
2824          fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
2825          if (prev_inst && inst->equals(prev_inst)) {
2826             inst->remove(block);
2827             progress = true;
2828             continue;
2829          }
2830       }
2831
2832       /* Clear out the last-write records for MRFs that were overwritten. */
2833       if (inst->dst.file == MRF) {
2834          last_mrf_move[inst->dst.nr] = NULL;
2835       }
2836
2837       if (inst->mlen > 0 && inst->base_mrf != -1) {
2838          /* Found a SEND instruction, which will include two or fewer
2839           * implied MRF writes.  We could do better here.
2840           */
2841          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2842             last_mrf_move[inst->base_mrf + i] = NULL;
2843          }
2844       }
2845
2846       /* Clear out any MRF move records whose sources got overwritten. */
2847       if (inst->dst.file == VGRF) {
2848          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2849             if (last_mrf_move[i] &&
2850                 last_mrf_move[i]->src[0].nr == inst->dst.nr) {
2851                last_mrf_move[i] = NULL;
2852             }
2853          }
2854       }
2855
2856       if (inst->opcode == BRW_OPCODE_MOV &&
2857           inst->dst.file == MRF &&
2858           inst->src[0].file == VGRF &&
2859           !inst->is_partial_write()) {
2860          last_mrf_move[inst->dst.nr] = inst;
2861       }
2862    }
2863
2864    if (progress)
2865       invalidate_live_intervals();
2866
2867    return progress;
2868 }
2869
2870 static void
2871 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2872 {
2873    /* Clear the flag for registers that actually got read (as expected). */
2874    for (int i = 0; i < inst->sources; i++) {
2875       int grf;
2876       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
2877          grf = inst->src[i].nr;
2878       } else {
2879          continue;
2880       }
2881
2882       if (grf >= first_grf &&
2883           grf < first_grf + grf_len) {
2884          deps[grf - first_grf] = false;
2885          if (inst->exec_size == 16)
2886             deps[grf - first_grf + 1] = false;
2887       }
2888    }
2889 }
2890
2891 /**
2892  * Implements this workaround for the original 965:
2893  *
2894  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2895  *      check for post destination dependencies on this instruction, software
2896  *      must ensure that there is no destination hazard for the case of ‘write
2897  *      followed by a posted write’ shown in the following example.
2898  *
2899  *      1. mov r3 0
2900  *      2. send r3.xy <rest of send instruction>
2901  *      3. mov r2 r3
2902  *
2903  *      Due to no post-destination dependency check on the ‘send’, the above
2904  *      code sequence could have two instructions (1 and 2) in flight at the
2905  *      same time that both consider ‘r3’ as the target of their final writes.
2906  */
2907 void
2908 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2909                                                         fs_inst *inst)
2910 {
2911    int write_len = inst->regs_written;
2912    int first_write_grf = inst->dst.nr;
2913    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2914    assert(write_len < (int)sizeof(needs_dep) - 1);
2915
2916    memset(needs_dep, false, sizeof(needs_dep));
2917    memset(needs_dep, true, write_len);
2918
2919    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2920
2921    /* Walk backwards looking for writes to registers we're writing which
2922     * aren't read since being written.  If we hit the start of the program,
2923     * we assume that there are no outstanding dependencies on entry to the
2924     * program.
2925     */
2926    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2927       /* If we hit control flow, assume that there *are* outstanding
2928        * dependencies, and force their cleanup before our instruction.
2929        */
2930       if (block->start() == scan_inst) {
2931          for (int i = 0; i < write_len; i++) {
2932             if (needs_dep[i])
2933                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
2934                                first_write_grf + i);
2935          }
2936          return;
2937       }
2938
2939       /* We insert our reads as late as possible on the assumption that any
2940        * instruction but a MOV that might have left us an outstanding
2941        * dependency has more latency than a MOV.
2942        */
2943       if (scan_inst->dst.file == VGRF) {
2944          for (int i = 0; i < scan_inst->regs_written; i++) {
2945             int reg = scan_inst->dst.nr + i;
2946
2947             if (reg >= first_write_grf &&
2948                 reg < first_write_grf + write_len &&
2949                 needs_dep[reg - first_write_grf]) {
2950                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
2951                needs_dep[reg - first_write_grf] = false;
2952                if (scan_inst->exec_size == 16)
2953                   needs_dep[reg - first_write_grf + 1] = false;
2954             }
2955          }
2956       }
2957
2958       /* Clear the flag for registers that actually got read (as expected). */
2959       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2960
2961       /* Continue the loop only if we haven't resolved all the dependencies */
2962       int i;
2963       for (i = 0; i < write_len; i++) {
2964          if (needs_dep[i])
2965             break;
2966       }
2967       if (i == write_len)
2968          return;
2969    }
2970 }
2971
2972 /**
2973  * Implements this workaround for the original 965:
2974  *
2975  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2976  *      used as a destination register until after it has been sourced by an
2977  *      instruction with a different destination register.
2978  */
2979 void
2980 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2981 {
2982    int write_len = inst->regs_written;
2983    int first_write_grf = inst->dst.nr;
2984    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2985    assert(write_len < (int)sizeof(needs_dep) - 1);
2986
2987    memset(needs_dep, false, sizeof(needs_dep));
2988    memset(needs_dep, true, write_len);
2989    /* Walk forwards looking for writes to registers we're writing which aren't
2990     * read before being written.
2991     */
2992    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
2993       /* If we hit control flow, force resolve all remaining dependencies. */
2994       if (block->end() == scan_inst) {
2995          for (int i = 0; i < write_len; i++) {
2996             if (needs_dep[i])
2997                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2998                                first_write_grf + i);
2999          }
3000          return;
3001       }
3002
3003       /* Clear the flag for registers that actually got read (as expected). */
3004       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3005
3006       /* We insert our reads as late as possible since they're reading the
3007        * result of a SEND, which has massive latency.
3008        */
3009       if (scan_inst->dst.file == VGRF &&
3010           scan_inst->dst.nr >= first_write_grf &&
3011           scan_inst->dst.nr < first_write_grf + write_len &&
3012           needs_dep[scan_inst->dst.nr - first_write_grf]) {
3013          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3014                          scan_inst->dst.nr);
3015          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3016       }
3017
3018       /* Continue the loop only if we haven't resolved all the dependencies */
3019       int i;
3020       for (i = 0; i < write_len; i++) {
3021          if (needs_dep[i])
3022             break;
3023       }
3024       if (i == write_len)
3025          return;
3026    }
3027 }
3028
3029 void
3030 fs_visitor::insert_gen4_send_dependency_workarounds()
3031 {
3032    if (devinfo->gen != 4 || devinfo->is_g4x)
3033       return;
3034
3035    bool progress = false;
3036
3037    /* Note that we're done with register allocation, so GRF fs_regs always
3038     * have a .reg_offset of 0.
3039     */
3040
3041    foreach_block_and_inst(block, fs_inst, inst, cfg) {
3042       if (inst->mlen != 0 && inst->dst.file == VGRF) {
3043          insert_gen4_pre_send_dependency_workarounds(block, inst);
3044          insert_gen4_post_send_dependency_workarounds(block, inst);
3045          progress = true;
3046       }
3047    }
3048
3049    if (progress)
3050       invalidate_live_intervals();
3051 }
3052
3053 /**
3054  * Turns the generic expression-style uniform pull constant load instruction
3055  * into a hardware-specific series of instructions for loading a pull
3056  * constant.
3057  *
3058  * The expression style allows the CSE pass before this to optimize out
3059  * repeated loads from the same offset, and gives the pre-register-allocation
3060  * scheduling full flexibility, while the conversion to native instructions
3061  * allows the post-register-allocation scheduler the best information
3062  * possible.
3063  *
3064  * Note that execution masking for setting up pull constant loads is special:
3065  * the channels that need to be written are unrelated to the current execution
3066  * mask, since a later instruction will use one of the result channels as a
3067  * source operand for all 8 or 16 of its channels.
3068  */
3069 void
3070 fs_visitor::lower_uniform_pull_constant_loads()
3071 {
3072    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3073       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3074          continue;
3075
3076       if (devinfo->gen >= 7) {
3077          /* The offset arg before was a vec4-aligned byte offset.  We need to
3078           * turn it into a dword offset.
3079           */
3080          fs_reg const_offset_reg = inst->src[1];
3081          assert(const_offset_reg.file == IMM &&
3082                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3083          const_offset_reg.ud /= 4;
3084
3085          fs_reg payload, offset;
3086          if (devinfo->gen >= 9) {
3087             /* We have to use a message header on Skylake to get SIMD4x2
3088              * mode.  Reserve space for the register.
3089             */
3090             offset = payload = fs_reg(VGRF, alloc.allocate(2));
3091             offset.reg_offset++;
3092             inst->mlen = 2;
3093          } else {
3094             offset = payload = fs_reg(VGRF, alloc.allocate(1));
3095             inst->mlen = 1;
3096          }
3097
3098          /* This is actually going to be a MOV, but since only the first dword
3099           * is accessed, we have a special opcode to do just that one.  Note
3100           * that this needs to be an operation that will be considered a def
3101           * by live variable analysis, or register allocation will explode.
3102           */
3103          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3104                                                8, offset, const_offset_reg);
3105          setup->force_writemask_all = true;
3106
3107          setup->ir = inst->ir;
3108          setup->annotation = inst->annotation;
3109          inst->insert_before(block, setup);
3110
3111          /* Similarly, this will only populate the first 4 channels of the
3112           * result register (since we only use smear values from 0-3), but we
3113           * don't tell the optimizer.
3114           */
3115          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3116          inst->src[1] = payload;
3117          inst->base_mrf = -1;
3118
3119          invalidate_live_intervals();
3120       } else {
3121          /* Before register allocation, we didn't tell the scheduler about the
3122           * MRF we use.  We know it's safe to use this MRF because nothing
3123           * else does except for register spill/unspill, which generates and
3124           * uses its MRF within a single IR instruction.
3125           */
3126          inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
3127          inst->mlen = 1;
3128       }
3129    }
3130 }
3131
3132 bool
3133 fs_visitor::lower_load_payload()
3134 {
3135    bool progress = false;
3136
3137    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3138       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3139          continue;
3140
3141       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3142       assert(inst->saturate == false);
3143       fs_reg dst = inst->dst;
3144
3145       /* Get rid of COMPR4.  We'll add it back in if we need it */
3146       if (dst.file == MRF)
3147          dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3148
3149       const fs_builder ibld(this, block, inst);
3150       const fs_builder hbld = ibld.exec_all().group(8, 0);
3151
3152       for (uint8_t i = 0; i < inst->header_size; i++) {
3153          if (inst->src[i].file != BAD_FILE) {
3154             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3155             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3156             hbld.MOV(mov_dst, mov_src);
3157          }
3158          dst = offset(dst, hbld, 1);
3159       }
3160
3161       if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3162           inst->exec_size > 8) {
3163          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3164           * a straightforward copy.  Instead, the result of the
3165           * LOAD_PAYLOAD is treated as interleaved and the first four
3166           * non-header sources are unpacked as:
3167           *
3168           * m + 0: r0
3169           * m + 1: g0
3170           * m + 2: b0
3171           * m + 3: a0
3172           * m + 4: r1
3173           * m + 5: g1
3174           * m + 6: b1
3175           * m + 7: a1
3176           *
3177           * This is used for gen <= 5 fb writes.
3178           */
3179          assert(inst->exec_size == 16);
3180          assert(inst->header_size + 4 <= inst->sources);
3181          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3182             if (inst->src[i].file != BAD_FILE) {
3183                if (devinfo->has_compr4) {
3184                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
3185                   compr4_dst.nr |= BRW_MRF_COMPR4;
3186                   ibld.MOV(compr4_dst, inst->src[i]);
3187                } else {
3188                   /* Platform doesn't have COMPR4.  We have to fake it */
3189                   fs_reg mov_dst = retype(dst, inst->src[i].type);
3190                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3191                   mov_dst.nr += 4;
3192                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3193                }
3194             }
3195
3196             dst.nr++;
3197          }
3198
3199          /* The loop above only ever incremented us through the first set
3200           * of 4 registers.  However, thanks to the magic of COMPR4, we
3201           * actually wrote to the first 8 registers, so we need to take
3202           * that into account now.
3203           */
3204          dst.nr += 4;
3205
3206          /* The COMPR4 code took care of the first 4 sources.  We'll let
3207           * the regular path handle any remaining sources.  Yes, we are
3208           * modifying the instruction but we're about to delete it so
3209           * this really doesn't hurt anything.
3210           */
3211          inst->header_size += 4;
3212       }
3213
3214       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3215          if (inst->src[i].file != BAD_FILE)
3216             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3217          dst = offset(dst, ibld, 1);
3218       }
3219
3220       inst->remove(block);
3221       progress = true;
3222    }
3223
3224    if (progress)
3225       invalidate_live_intervals();
3226
3227    return progress;
3228 }
3229
3230 bool
3231 fs_visitor::lower_integer_multiplication()
3232 {
3233    bool progress = false;
3234
3235    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3236       const fs_builder ibld(this, block, inst);
3237
3238       if (inst->opcode == BRW_OPCODE_MUL) {
3239          if (inst->dst.is_accumulator() ||
3240              (inst->dst.type != BRW_REGISTER_TYPE_D &&
3241               inst->dst.type != BRW_REGISTER_TYPE_UD))
3242             continue;
3243
3244          /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
3245           * operation directly, but CHV/BXT cannot.
3246           */
3247          if (devinfo->gen >= 8 &&
3248              !devinfo->is_cherryview && !devinfo->is_broxton)
3249             continue;
3250
3251          if (inst->src[1].file == IMM &&
3252              inst->src[1].ud < (1 << 16)) {
3253             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3254              * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3255              * src1 are used.
3256              *
3257              * If multiplying by an immediate value that fits in 16-bits, do a
3258              * single MUL instruction with that value in the proper location.
3259              */
3260             if (devinfo->gen < 7) {
3261                fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
3262                           inst->dst.type);
3263                ibld.MOV(imm, inst->src[1]);
3264                ibld.MUL(inst->dst, imm, inst->src[0]);
3265             } else {
3266                ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3267             }
3268          } else {
3269             /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3270              * do 32-bit integer multiplication in one instruction, but instead
3271              * must do a sequence (which actually calculates a 64-bit result):
3272              *
3273              *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3274              *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3275              *    mov(8)  g2<1>D     acc0<8,8,1>D
3276              *
3277              * But on Gen > 6, the ability to use second accumulator register
3278              * (acc1) for non-float data types was removed, preventing a simple
3279              * implementation in SIMD16. A 16-channel result can be calculated by
3280              * executing the three instructions twice in SIMD8, once with quarter
3281              * control of 1Q for the first eight channels and again with 2Q for
3282              * the second eight channels.
3283              *
3284              * Which accumulator register is implicitly accessed (by AccWrEnable
3285              * for instance) is determined by the quarter control. Unfortunately
3286              * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3287              * implicit accumulator access by an instruction with 2Q will access
3288              * acc1 regardless of whether the data type is usable in acc1.
3289              *
3290              * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3291              * integer data types.
3292              *
3293              * Since we only want the low 32-bits of the result, we can do two
3294              * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3295              * adjust the high result and add them (like the mach is doing):
3296              *
3297              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3298              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3299              *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3300              *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3301              *
3302              * We avoid the shl instruction by realizing that we only want to add
3303              * the low 16-bits of the "high" result to the high 16-bits of the
3304              * "low" result and using proper regioning on the add:
3305              *
3306              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3307              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3308              *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3309              *
3310              * Since it does not use the (single) accumulator register, we can
3311              * schedule multi-component multiplications much better.
3312              */
3313
3314             fs_reg orig_dst = inst->dst;
3315             if (orig_dst.is_null() || orig_dst.file == MRF) {
3316                inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
3317                                   inst->dst.type);
3318             }
3319             fs_reg low = inst->dst;
3320             fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
3321                         inst->dst.type);
3322
3323             if (devinfo->gen >= 7) {
3324                fs_reg src1_0_w = inst->src[1];
3325                fs_reg src1_1_w = inst->src[1];
3326
3327                if (inst->src[1].file == IMM) {
3328                   src1_0_w.ud &= 0xffff;
3329                   src1_1_w.ud >>= 16;
3330                } else {
3331                   src1_0_w.type = BRW_REGISTER_TYPE_UW;
3332                   if (src1_0_w.stride != 0) {
3333                      assert(src1_0_w.stride == 1);
3334                      src1_0_w.stride = 2;
3335                   }
3336
3337                   src1_1_w.type = BRW_REGISTER_TYPE_UW;
3338                   if (src1_1_w.stride != 0) {
3339                      assert(src1_1_w.stride == 1);
3340                      src1_1_w.stride = 2;
3341                   }
3342                   src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3343                }
3344                ibld.MUL(low, inst->src[0], src1_0_w);
3345                ibld.MUL(high, inst->src[0], src1_1_w);
3346             } else {
3347                fs_reg src0_0_w = inst->src[0];
3348                fs_reg src0_1_w = inst->src[0];
3349
3350                src0_0_w.type = BRW_REGISTER_TYPE_UW;
3351                if (src0_0_w.stride != 0) {
3352                   assert(src0_0_w.stride == 1);
3353                   src0_0_w.stride = 2;
3354                }
3355
3356                src0_1_w.type = BRW_REGISTER_TYPE_UW;
3357                if (src0_1_w.stride != 0) {
3358                   assert(src0_1_w.stride == 1);
3359                   src0_1_w.stride = 2;
3360                }
3361                src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3362
3363                ibld.MUL(low, src0_0_w, inst->src[1]);
3364                ibld.MUL(high, src0_1_w, inst->src[1]);
3365             }
3366
3367             fs_reg dst = inst->dst;
3368             dst.type = BRW_REGISTER_TYPE_UW;
3369             dst.subreg_offset = 2;
3370             dst.stride = 2;
3371
3372             high.type = BRW_REGISTER_TYPE_UW;
3373             high.stride = 2;
3374
3375             low.type = BRW_REGISTER_TYPE_UW;
3376             low.subreg_offset = 2;
3377             low.stride = 2;
3378
3379             ibld.ADD(dst, low, high);
3380
3381             if (inst->conditional_mod || orig_dst.file == MRF) {
3382                set_condmod(inst->conditional_mod,
3383                            ibld.MOV(orig_dst, inst->dst));
3384             }
3385          }
3386
3387       } else if (inst->opcode == SHADER_OPCODE_MULH) {
3388          /* Should have been lowered to 8-wide. */
3389          assert(inst->exec_size <= 8);
3390          const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
3391                                    inst->dst.type);
3392          fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3393          fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3394
3395          if (devinfo->gen >= 8) {
3396             /* Until Gen8, integer multiplies read 32-bits from one source,
3397              * and 16-bits from the other, and relying on the MACH instruction
3398              * to generate the high bits of the result.
3399              *
3400              * On Gen8, the multiply instruction does a full 32x32-bit
3401              * multiply, but in order to do a 64-bit multiply we can simulate
3402              * the previous behavior and then use a MACH instruction.
3403              *
3404              * FINISHME: Don't use source modifiers on src1.
3405              */
3406             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
3407                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
3408             mul->src[1].type = (type_is_signed(mul->src[1].type) ?
3409                                 BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
3410             mul->src[1].stride *= 2;
3411
3412          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
3413                     inst->force_sechalf) {
3414             /* Among other things the quarter control bits influence which
3415              * accumulator register is used by the hardware for instructions
3416              * that access the accumulator implicitly (e.g. MACH).  A
3417              * second-half instruction would normally map to acc1, which
3418              * doesn't exist on Gen7 and up (the hardware does emulate it for
3419              * floating-point instructions *only* by taking advantage of the
3420              * extra precision of acc0 not normally used for floating point
3421              * arithmetic).
3422              *
3423              * HSW and up are careful enough not to try to access an
3424              * accumulator register that doesn't exist, but on earlier Gen7
3425              * hardware we need to make sure that the quarter control bits are
3426              * zero to avoid non-deterministic behaviour and emit an extra MOV
3427              * to get the result masked correctly according to the current
3428              * channel enables.
3429              */
3430             mach->force_sechalf = false;
3431             mach->force_writemask_all = true;
3432             mach->dst = ibld.vgrf(inst->dst.type);
3433             ibld.MOV(inst->dst, mach->dst);
3434          }
3435       } else {
3436          continue;
3437       }
3438
3439       inst->remove(block);
3440       progress = true;
3441    }
3442
3443    if (progress)
3444       invalidate_live_intervals();
3445
3446    return progress;
3447 }
3448
3449 static void
3450 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
3451                     fs_reg *dst, fs_reg color, unsigned components)
3452 {
3453    if (key->clamp_fragment_color) {
3454       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
3455       assert(color.type == BRW_REGISTER_TYPE_F);
3456
3457       for (unsigned i = 0; i < components; i++)
3458          set_saturate(true,
3459                       bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
3460
3461       color = tmp;
3462    }
3463
3464    for (unsigned i = 0; i < components; i++)
3465       dst[i] = offset(color, bld, i);
3466 }
3467
3468 static void
3469 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
3470                             const brw_wm_prog_data *prog_data,
3471                             const brw_wm_prog_key *key,
3472                             const fs_visitor::thread_payload &payload)
3473 {
3474    assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
3475    const brw_device_info *devinfo = bld.shader->devinfo;
3476    const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
3477    const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
3478    const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
3479    const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
3480    const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
3481    const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
3482    fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
3483    const unsigned components =
3484       inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
3485
3486    /* We can potentially have a message length of up to 15, so we have to set
3487     * base_mrf to either 0 or 1 in order to fit in m0..m15.
3488     */
3489    fs_reg sources[15];
3490    int header_size = 2, payload_header_size;
3491    unsigned length = 0;
3492
3493    /* From the Sandy Bridge PRM, volume 4, page 198:
3494     *
3495     *     "Dispatched Pixel Enables. One bit per pixel indicating
3496     *      which pixels were originally enabled when the thread was
3497     *      dispatched. This field is only required for the end-of-
3498     *      thread message and on all dual-source messages."
3499     */
3500    if (devinfo->gen >= 6 &&
3501        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3502        color1.file == BAD_FILE &&
3503        key->nr_color_regions == 1) {
3504       header_size = 0;
3505    }
3506
3507    if (header_size != 0) {
3508       assert(header_size == 2);
3509       /* Allocate 2 registers for a header */
3510       length += 2;
3511    }
3512
3513    if (payload.aa_dest_stencil_reg) {
3514       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
3515       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
3516          .MOV(sources[length],
3517               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
3518       length++;
3519    }
3520
3521    if (prog_data->uses_omask) {
3522       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
3523                                BRW_REGISTER_TYPE_UD);
3524
3525       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
3526        * relevant.  Since it's unsigned single words one vgrf is always
3527        * 16-wide, but only the lower or higher 8 channels will be used by the
3528        * hardware when doing a SIMD8 write depending on whether we have
3529        * selected the subspans for the first or second half respectively.
3530        */
3531       assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
3532       sample_mask.type = BRW_REGISTER_TYPE_UW;
3533       sample_mask.stride *= 2;
3534
3535       bld.exec_all().annotate("FB write oMask")
3536          .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
3537                    inst->force_sechalf),
3538               sample_mask);
3539       length++;
3540    }
3541
3542    payload_header_size = length;
3543
3544    if (src0_alpha.file != BAD_FILE) {
3545       /* FIXME: This is being passed at the wrong location in the payload and
3546        * doesn't work when gl_SampleMask and MRTs are used simultaneously.
3547        * It's supposed to be immediately before oMask but there seems to be no
3548        * reasonable way to pass them in the correct order because LOAD_PAYLOAD
3549        * requires header sources to form a contiguous segment at the beginning
3550        * of the message and src0_alpha has per-channel semantics.
3551        */
3552       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
3553       length++;
3554    }
3555
3556    setup_color_payload(bld, key, &sources[length], color0, components);
3557    length += 4;
3558
3559    if (color1.file != BAD_FILE) {
3560       setup_color_payload(bld, key, &sources[length], color1, components);
3561       length += 4;
3562    }
3563
3564    if (src_depth.file != BAD_FILE) {
3565       sources[length] = src_depth;
3566       length++;
3567    }
3568
3569    if (dst_depth.file != BAD_FILE) {
3570       sources[length] = dst_depth;
3571       length++;
3572    }
3573
3574    if (src_stencil.file != BAD_FILE) {
3575       assert(devinfo->gen >= 9);
3576       assert(bld.dispatch_width() != 16);
3577
3578       sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3579       bld.exec_all().annotate("FB write OS")
3580          .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length],
3581                retype(src_stencil, BRW_REGISTER_TYPE_UB));
3582       length++;
3583    }
3584
3585    fs_inst *load;
3586    if (devinfo->gen >= 7) {
3587       /* Send from the GRF */
3588       fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
3589       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
3590       payload.nr = bld.shader->alloc.allocate(load->regs_written);
3591       load->dst = payload;
3592
3593       inst->src[0] = payload;
3594       inst->resize_sources(1);
3595       inst->base_mrf = -1;
3596    } else {
3597       /* Send from the MRF */
3598       load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3599                               sources, length, payload_header_size);
3600
3601       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
3602        * will do this for us if we just give it a COMPR4 destination.
3603        */
3604       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
3605          load->dst.nr |= BRW_MRF_COMPR4;
3606
3607       inst->resize_sources(0);
3608       inst->base_mrf = 1;
3609    }
3610
3611    inst->opcode = FS_OPCODE_FB_WRITE;
3612    inst->mlen = load->regs_written;
3613    inst->header_size = header_size;
3614 }
3615
3616 static void
3617 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
3618                                 const fs_reg &coordinate,
3619                                 const fs_reg &shadow_c,
3620                                 const fs_reg &lod, const fs_reg &lod2,
3621                                 const fs_reg &sampler,
3622                                 unsigned coord_components,
3623                                 unsigned grad_components)
3624 {
3625    const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
3626                          op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
3627    fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
3628    fs_reg msg_end = msg_begin;
3629
3630    /* g0 header. */
3631    msg_end = offset(msg_end, bld.group(8, 0), 1);
3632
3633    for (unsigned i = 0; i < coord_components; i++)
3634       bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
3635               offset(coordinate, bld, i));
3636
3637    msg_end = offset(msg_end, bld, coord_components);
3638
3639    /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
3640     * require all three components to be present and zero if they are unused.
3641     */
3642    if (coord_components > 0 &&
3643        (has_lod || shadow_c.file != BAD_FILE ||
3644         (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
3645       for (unsigned i = coord_components; i < 3; i++)
3646          bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
3647
3648       msg_end = offset(msg_end, bld, 3 - coord_components);
3649    }
3650
3651    if (op == SHADER_OPCODE_TXD) {
3652       /* TXD unsupported in SIMD16 mode. */
3653       assert(bld.dispatch_width() == 8);
3654
3655       /* the slots for u and v are always present, but r is optional */
3656       if (coord_components < 2)
3657          msg_end = offset(msg_end, bld, 2 - coord_components);
3658
3659       /*  P   = u, v, r
3660        * dPdx = dudx, dvdx, drdx
3661        * dPdy = dudy, dvdy, drdy
3662        *
3663        * 1-arg: Does not exist.
3664        *
3665        * 2-arg: dudx   dvdx   dudy   dvdy
3666        *        dPdx.x dPdx.y dPdy.x dPdy.y
3667        *        m4     m5     m6     m7
3668        *
3669        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
3670        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
3671        *        m5     m6     m7     m8     m9     m10
3672        */
3673       for (unsigned i = 0; i < grad_components; i++)
3674          bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
3675
3676       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3677
3678       for (unsigned i = 0; i < grad_components; i++)
3679          bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
3680
3681       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3682    }
3683
3684    if (has_lod) {
3685       /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
3686        * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
3687        */
3688       assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
3689              bld.dispatch_width() == 16);
3690
3691       const brw_reg_type type =
3692          (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
3693           BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
3694       bld.MOV(retype(msg_end, type), lod);
3695       msg_end = offset(msg_end, bld, 1);
3696    }
3697
3698    if (shadow_c.file != BAD_FILE) {
3699       if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
3700          /* There's no plain shadow compare message, so we use shadow
3701           * compare with a bias of 0.0.
3702           */
3703          bld.MOV(msg_end, fs_reg(0.0f));
3704          msg_end = offset(msg_end, bld, 1);
3705       }
3706
3707       bld.MOV(msg_end, shadow_c);
3708       msg_end = offset(msg_end, bld, 1);
3709    }
3710
3711    inst->opcode = op;
3712    inst->src[0] = reg_undef;
3713    inst->src[1] = sampler;
3714    inst->resize_sources(2);
3715    inst->base_mrf = msg_begin.nr;
3716    inst->mlen = msg_end.nr - msg_begin.nr;
3717    inst->header_size = 1;
3718 }
3719
3720 static void
3721 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
3722                                 fs_reg coordinate,
3723                                 const fs_reg &shadow_c,
3724                                 fs_reg lod, fs_reg lod2,
3725                                 const fs_reg &sample_index,
3726                                 const fs_reg &sampler,
3727                                 const fs_reg &offset_value,
3728                                 unsigned coord_components,
3729                                 unsigned grad_components)
3730 {
3731    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
3732    fs_reg msg_coords = message;
3733    unsigned header_size = 0;
3734
3735    if (offset_value.file != BAD_FILE) {
3736       /* The offsets set up by the visitor are in the m1 header, so we can't
3737        * go headerless.
3738        */
3739       header_size = 1;
3740       message.nr--;
3741    }
3742
3743    for (unsigned i = 0; i < coord_components; i++) {
3744       bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
3745       coordinate = offset(coordinate, bld, 1);
3746    }
3747    fs_reg msg_end = offset(msg_coords, bld, coord_components);
3748    fs_reg msg_lod = offset(msg_coords, bld, 4);
3749
3750    if (shadow_c.file != BAD_FILE) {
3751       fs_reg msg_shadow = msg_lod;
3752       bld.MOV(msg_shadow, shadow_c);
3753       msg_lod = offset(msg_shadow, bld, 1);
3754       msg_end = msg_lod;
3755    }
3756
3757    switch (op) {
3758    case SHADER_OPCODE_TXL:
3759    case FS_OPCODE_TXB:
3760       bld.MOV(msg_lod, lod);
3761       msg_end = offset(msg_lod, bld, 1);
3762       break;
3763    case SHADER_OPCODE_TXD:
3764       /**
3765        *  P   =  u,    v,    r
3766        * dPdx = dudx, dvdx, drdx
3767        * dPdy = dudy, dvdy, drdy
3768        *
3769        * Load up these values:
3770        * - dudx   dudy   dvdx   dvdy   drdx   drdy
3771        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
3772        */
3773       msg_end = msg_lod;
3774       for (unsigned i = 0; i < grad_components; i++) {
3775          bld.MOV(msg_end, lod);
3776          lod = offset(lod, bld, 1);
3777          msg_end = offset(msg_end, bld, 1);
3778
3779          bld.MOV(msg_end, lod2);
3780          lod2 = offset(lod2, bld, 1);
3781          msg_end = offset(msg_end, bld, 1);
3782       }
3783       break;
3784    case SHADER_OPCODE_TXS:
3785       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
3786       bld.MOV(msg_lod, lod);
3787       msg_end = offset(msg_lod, bld, 1);
3788       break;
3789    case SHADER_OPCODE_TXF:
3790       msg_lod = offset(msg_coords, bld, 3);
3791       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
3792       msg_end = offset(msg_lod, bld, 1);
3793       break;
3794    case SHADER_OPCODE_TXF_CMS:
3795       msg_lod = offset(msg_coords, bld, 3);
3796       /* lod */
3797       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
3798       /* sample index */
3799       bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
3800       msg_end = offset(msg_lod, bld, 2);
3801       break;
3802    default:
3803       break;
3804    }
3805
3806    inst->opcode = op;
3807    inst->src[0] = reg_undef;
3808    inst->src[1] = sampler;
3809    inst->resize_sources(2);
3810    inst->base_mrf = message.nr;
3811    inst->mlen = msg_end.nr - message.nr;
3812    inst->header_size = header_size;
3813
3814    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
3815    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
3816 }
3817
3818 static bool
3819 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
3820 {
3821    if (devinfo->gen < 8 && !devinfo->is_haswell)
3822       return false;
3823
3824    return sampler.file != IMM || sampler.ud >= 16;
3825 }
3826
3827 static void
3828 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
3829                                 fs_reg coordinate,
3830                                 const fs_reg &shadow_c,
3831                                 fs_reg lod, fs_reg lod2,
3832                                 const fs_reg &sample_index,
3833                                 const fs_reg &mcs, const fs_reg &sampler,
3834                                 fs_reg offset_value,
3835                                 unsigned coord_components,
3836                                 unsigned grad_components)
3837 {
3838    const brw_device_info *devinfo = bld.shader->devinfo;
3839    int reg_width = bld.dispatch_width() / 8;
3840    unsigned header_size = 0, length = 0;
3841    fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
3842    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
3843       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
3844
3845    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
3846        offset_value.file != BAD_FILE ||
3847        is_high_sampler(devinfo, sampler)) {
3848       /* For general texture offsets (no txf workaround), we need a header to
3849        * put them in.  Note that we're only reserving space for it in the
3850        * message payload as it will be initialized implicitly by the
3851        * generator.
3852        *
3853        * TG4 needs to place its channel select in the header, for interaction
3854        * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
3855        * larger sampler numbers we need to offset the Sampler State Pointer in
3856        * the header.
3857        */
3858       header_size = 1;
3859       sources[0] = fs_reg();
3860       length++;
3861    }
3862
3863    if (shadow_c.file != BAD_FILE) {
3864       bld.MOV(sources[length], shadow_c);
3865       length++;
3866    }
3867
3868    bool coordinate_done = false;
3869
3870    /* The sampler can only meaningfully compute LOD for fragment shader
3871     * messages. For all other stages, we change the opcode to TXL and
3872     * hardcode the LOD to 0.
3873     */
3874    if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
3875        op == SHADER_OPCODE_TEX) {
3876       op = SHADER_OPCODE_TXL;
3877       lod = fs_reg(0.0f);
3878    }
3879
3880    /* Set up the LOD info */
3881    switch (op) {
3882    case FS_OPCODE_TXB:
3883    case SHADER_OPCODE_TXL:
3884       bld.MOV(sources[length], lod);
3885       length++;
3886       break;
3887    case SHADER_OPCODE_TXD:
3888       /* TXD should have been lowered in SIMD16 mode. */
3889       assert(bld.dispatch_width() == 8);
3890
3891       /* Load dPdx and the coordinate together:
3892        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
3893        */
3894       for (unsigned i = 0; i < coord_components; i++) {
3895          bld.MOV(sources[length], coordinate);
3896          coordinate = offset(coordinate, bld, 1);
3897          length++;
3898
3899          /* For cube map array, the coordinate is (u,v,r,ai) but there are
3900           * only derivatives for (u, v, r).
3901           */
3902          if (i < grad_components) {
3903             bld.MOV(sources[length], lod);
3904             lod = offset(lod, bld, 1);
3905             length++;
3906
3907             bld.MOV(sources[length], lod2);
3908             lod2 = offset(lod2, bld, 1);
3909             length++;
3910          }
3911       }
3912
3913       coordinate_done = true;
3914       break;
3915    case SHADER_OPCODE_TXS:
3916       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
3917       length++;
3918       break;
3919    case SHADER_OPCODE_TXF:
3920       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
3921        * On Gen9 they are u, v, lod, r
3922        */
3923       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3924       coordinate = offset(coordinate, bld, 1);
3925       length++;
3926
3927       if (devinfo->gen >= 9) {
3928          if (coord_components >= 2) {
3929             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3930             coordinate = offset(coordinate, bld, 1);
3931          }
3932          length++;
3933       }
3934
3935       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
3936       length++;
3937
3938       for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
3939          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3940          coordinate = offset(coordinate, bld, 1);
3941          length++;
3942       }
3943
3944       coordinate_done = true;
3945       break;
3946    case SHADER_OPCODE_TXF_CMS:
3947    case SHADER_OPCODE_TXF_CMS_W:
3948    case SHADER_OPCODE_TXF_UMS:
3949    case SHADER_OPCODE_TXF_MCS:
3950       if (op == SHADER_OPCODE_TXF_UMS ||
3951           op == SHADER_OPCODE_TXF_CMS ||
3952           op == SHADER_OPCODE_TXF_CMS_W) {
3953          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
3954          length++;
3955       }
3956
3957       if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
3958          /* Data from the multisample control surface. */
3959          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
3960          length++;
3961
3962          /* On Gen9+ we'll use ld2dms_w instead which has two registers for
3963           * the MCS data.
3964           */
3965          if (op == SHADER_OPCODE_TXF_CMS_W) {
3966             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
3967                     mcs.file == IMM ?
3968                     mcs :
3969                     offset(mcs, bld, 1));
3970             length++;
3971          }
3972       }
3973
3974       /* There is no offsetting for this message; just copy in the integer
3975        * texture coordinates.
3976        */
3977       for (unsigned i = 0; i < coord_components; i++) {
3978          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3979          coordinate = offset(coordinate, bld, 1);
3980          length++;
3981       }
3982
3983       coordinate_done = true;
3984       break;
3985    case SHADER_OPCODE_TG4_OFFSET:
3986       /* gather4_po_c should have been lowered in SIMD16 mode. */
3987       assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
3988
3989       /* More crazy intermixing */
3990       for (unsigned i = 0; i < 2; i++) { /* u, v */
3991          bld.MOV(sources[length], coordinate);
3992          coordinate = offset(coordinate, bld, 1);
3993          length++;
3994       }
3995
3996       for (unsigned i = 0; i < 2; i++) { /* offu, offv */
3997          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
3998          offset_value = offset(offset_value, bld, 1);
3999          length++;
4000       }
4001
4002       if (coord_components == 3) { /* r if present */
4003          bld.MOV(sources[length], coordinate);
4004          coordinate = offset(coordinate, bld, 1);
4005          length++;
4006       }
4007
4008       coordinate_done = true;
4009       break;
4010    default:
4011       break;
4012    }
4013
4014    /* Set up the coordinate (except for cases where it was done above) */
4015    if (!coordinate_done) {
4016       for (unsigned i = 0; i < coord_components; i++) {
4017          bld.MOV(sources[length], coordinate);
4018          coordinate = offset(coordinate, bld, 1);
4019          length++;
4020       }
4021    }
4022
4023    int mlen;
4024    if (reg_width == 2)
4025       mlen = length * reg_width - header_size;
4026    else
4027       mlen = length * reg_width;
4028
4029    const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
4030                                      BRW_REGISTER_TYPE_F);
4031    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
4032
4033    /* Generate the SEND. */
4034    inst->opcode = op;
4035    inst->src[0] = src_payload;
4036    inst->src[1] = sampler;
4037    inst->resize_sources(2);
4038    inst->base_mrf = -1;
4039    inst->mlen = mlen;
4040    inst->header_size = header_size;
4041
4042    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
4043    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
4044 }
4045
4046 static void
4047 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
4048 {
4049    const brw_device_info *devinfo = bld.shader->devinfo;
4050    const fs_reg &coordinate = inst->src[0];
4051    const fs_reg &shadow_c = inst->src[1];
4052    const fs_reg &lod = inst->src[2];
4053    const fs_reg &lod2 = inst->src[3];
4054    const fs_reg &sample_index = inst->src[4];
4055    const fs_reg &mcs = inst->src[5];
4056    const fs_reg &sampler = inst->src[6];
4057    const fs_reg &offset_value = inst->src[7];
4058    assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
4059    const unsigned coord_components = inst->src[8].ud;
4060    const unsigned grad_components = inst->src[9].ud;
4061
4062    if (devinfo->gen >= 7) {
4063       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
4064                                       shadow_c, lod, lod2, sample_index,
4065                                       mcs, sampler, offset_value,
4066                                       coord_components, grad_components);
4067    } else if (devinfo->gen >= 5) {
4068       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
4069                                       shadow_c, lod, lod2, sample_index,
4070                                       sampler, offset_value,
4071                                       coord_components, grad_components);
4072    } else {
4073       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
4074                                       shadow_c, lod, lod2, sampler,
4075                                       coord_components, grad_components);
4076    }
4077 }
4078
4079 /**
4080  * Initialize the header present in some typed and untyped surface
4081  * messages.
4082  */
4083 static fs_reg
4084 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
4085 {
4086    fs_builder ubld = bld.exec_all().group(8, 0);
4087    const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4088    ubld.MOV(dst, fs_reg(0));
4089    ubld.MOV(component(dst, 7), sample_mask);
4090    return dst;
4091 }
4092
4093 static void
4094 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
4095                            const fs_reg &sample_mask)
4096 {
4097    /* Get the logical send arguments. */
4098    const fs_reg &addr = inst->src[0];
4099    const fs_reg &src = inst->src[1];
4100    const fs_reg &surface = inst->src[2];
4101    const UNUSED fs_reg &dims = inst->src[3];
4102    const fs_reg &arg = inst->src[4];
4103
4104    /* Calculate the total number of components of the payload. */
4105    const unsigned addr_sz = inst->components_read(0);
4106    const unsigned src_sz = inst->components_read(1);
4107    const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
4108    const unsigned sz = header_sz + addr_sz + src_sz;
4109
4110    /* Allocate space for the payload. */
4111    fs_reg *const components = new fs_reg[sz];
4112    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
4113    unsigned n = 0;
4114
4115    /* Construct the payload. */
4116    if (header_sz)
4117       components[n++] = emit_surface_header(bld, sample_mask);
4118
4119    for (unsigned i = 0; i < addr_sz; i++)
4120       components[n++] = offset(addr, bld, i);
4121
4122    for (unsigned i = 0; i < src_sz; i++)
4123       components[n++] = offset(src, bld, i);
4124
4125    bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
4126
4127    /* Update the original instruction. */
4128    inst->opcode = op;
4129    inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
4130    inst->header_size = header_sz;
4131
4132    inst->src[0] = payload;
4133    inst->src[1] = surface;
4134    inst->src[2] = arg;
4135    inst->resize_sources(3);
4136
4137    delete[] components;
4138 }
4139
4140 bool
4141 fs_visitor::lower_logical_sends()
4142 {
4143    bool progress = false;
4144
4145    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4146       const fs_builder ibld(this, block, inst);
4147
4148       switch (inst->opcode) {
4149       case FS_OPCODE_FB_WRITE_LOGICAL:
4150          assert(stage == MESA_SHADER_FRAGMENT);
4151          lower_fb_write_logical_send(ibld, inst,
4152                                      (const brw_wm_prog_data *)prog_data,
4153                                      (const brw_wm_prog_key *)key,
4154                                      payload);
4155          break;
4156
4157       case SHADER_OPCODE_TEX_LOGICAL:
4158          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
4159          break;
4160
4161       case SHADER_OPCODE_TXD_LOGICAL:
4162          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
4163          break;
4164
4165       case SHADER_OPCODE_TXF_LOGICAL:
4166          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
4167          break;
4168
4169       case SHADER_OPCODE_TXL_LOGICAL:
4170          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
4171          break;
4172
4173       case SHADER_OPCODE_TXS_LOGICAL:
4174          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
4175          break;
4176
4177       case FS_OPCODE_TXB_LOGICAL:
4178          lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
4179          break;
4180
4181       case SHADER_OPCODE_TXF_CMS_LOGICAL:
4182          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
4183          break;
4184
4185       case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4186          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
4187          break;
4188
4189       case SHADER_OPCODE_TXF_UMS_LOGICAL:
4190          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
4191          break;
4192
4193       case SHADER_OPCODE_TXF_MCS_LOGICAL:
4194          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
4195          break;
4196
4197       case SHADER_OPCODE_LOD_LOGICAL:
4198          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
4199          break;
4200
4201       case SHADER_OPCODE_TG4_LOGICAL:
4202          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
4203          break;
4204
4205       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4206          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
4207          break;
4208
4209       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4210          lower_surface_logical_send(ibld, inst,
4211                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
4212                                     fs_reg());
4213          break;
4214
4215       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4216          lower_surface_logical_send(ibld, inst,
4217                                     SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
4218                                     ibld.sample_mask_reg());
4219          break;
4220
4221       case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4222          lower_surface_logical_send(ibld, inst,
4223                                     SHADER_OPCODE_UNTYPED_ATOMIC,
4224                                     ibld.sample_mask_reg());
4225          break;
4226
4227       case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4228          lower_surface_logical_send(ibld, inst,
4229                                     SHADER_OPCODE_TYPED_SURFACE_READ,
4230                                     fs_reg(0xffff));
4231          break;
4232
4233       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4234          lower_surface_logical_send(ibld, inst,
4235                                     SHADER_OPCODE_TYPED_SURFACE_WRITE,
4236                                     ibld.sample_mask_reg());
4237          break;
4238
4239       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4240          lower_surface_logical_send(ibld, inst,
4241                                     SHADER_OPCODE_TYPED_ATOMIC,
4242                                     ibld.sample_mask_reg());
4243          break;
4244
4245       default:
4246          continue;
4247       }
4248
4249       progress = true;
4250    }
4251
4252    if (progress)
4253       invalidate_live_intervals();
4254
4255    return progress;
4256 }
4257
4258 /**
4259  * Get the closest native SIMD width supported by the hardware for instruction
4260  * \p inst.  The instruction will be left untouched by
4261  * fs_visitor::lower_simd_width() if the returned value is equal to the
4262  * original execution size.
4263  */
4264 static unsigned
4265 get_lowered_simd_width(const struct brw_device_info *devinfo,
4266                        const fs_inst *inst)
4267 {
4268    switch (inst->opcode) {
4269    case BRW_OPCODE_MOV:
4270    case BRW_OPCODE_SEL:
4271    case BRW_OPCODE_NOT:
4272    case BRW_OPCODE_AND:
4273    case BRW_OPCODE_OR:
4274    case BRW_OPCODE_XOR:
4275    case BRW_OPCODE_SHR:
4276    case BRW_OPCODE_SHL:
4277    case BRW_OPCODE_ASR:
4278    case BRW_OPCODE_CMP:
4279    case BRW_OPCODE_CMPN:
4280    case BRW_OPCODE_CSEL:
4281    case BRW_OPCODE_F32TO16:
4282    case BRW_OPCODE_F16TO32:
4283    case BRW_OPCODE_BFREV:
4284    case BRW_OPCODE_BFE:
4285    case BRW_OPCODE_BFI1:
4286    case BRW_OPCODE_BFI2:
4287    case BRW_OPCODE_ADD:
4288    case BRW_OPCODE_MUL:
4289    case BRW_OPCODE_AVG:
4290    case BRW_OPCODE_FRC:
4291    case BRW_OPCODE_RNDU:
4292    case BRW_OPCODE_RNDD:
4293    case BRW_OPCODE_RNDE:
4294    case BRW_OPCODE_RNDZ:
4295    case BRW_OPCODE_LZD:
4296    case BRW_OPCODE_FBH:
4297    case BRW_OPCODE_FBL:
4298    case BRW_OPCODE_CBIT:
4299    case BRW_OPCODE_SAD2:
4300    case BRW_OPCODE_MAD:
4301    case BRW_OPCODE_LRP:
4302    case SHADER_OPCODE_RCP:
4303    case SHADER_OPCODE_RSQ:
4304    case SHADER_OPCODE_SQRT:
4305    case SHADER_OPCODE_EXP2:
4306    case SHADER_OPCODE_LOG2:
4307    case SHADER_OPCODE_POW:
4308    case SHADER_OPCODE_INT_QUOTIENT:
4309    case SHADER_OPCODE_INT_REMAINDER:
4310    case SHADER_OPCODE_SIN:
4311    case SHADER_OPCODE_COS: {
4312       /* According to the PRMs:
4313        *  "A. In Direct Addressing mode, a source cannot span more than 2
4314        *      adjacent GRF registers.
4315        *   B. A destination cannot span more than 2 adjacent GRF registers."
4316        *
4317        * Look for the source or destination with the largest register region
4318        * which is the one that is going to limit the overal execution size of
4319        * the instruction due to this rule.
4320        */
4321       unsigned reg_count = inst->regs_written;
4322
4323       for (unsigned i = 0; i < inst->sources; i++)
4324          reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
4325
4326       /* Calculate the maximum execution size of the instruction based on the
4327        * factor by which it goes over the hardware limit of 2 GRFs.
4328        */
4329       return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4330    }
4331    case SHADER_OPCODE_MULH:
4332       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4333        * is 8-wide on Gen7+.
4334        */
4335       return (devinfo->gen >= 7 ? 8 : inst->exec_size);
4336
4337    case FS_OPCODE_FB_WRITE_LOGICAL:
4338       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
4339        * here.
4340        */
4341       assert(devinfo->gen != 6 ||
4342              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4343              inst->exec_size == 8);
4344       /* Dual-source FB writes are unsupported in SIMD16 mode. */
4345       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4346               8 : inst->exec_size);
4347
4348    case SHADER_OPCODE_TXD_LOGICAL:
4349       /* TXD is unsupported in SIMD16 mode. */
4350       return 8;
4351
4352    case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
4353       /* gather4_po_c is unsupported in SIMD16 mode. */
4354       const fs_reg &shadow_c = inst->src[1];
4355       return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
4356    }
4357    case SHADER_OPCODE_TXL_LOGICAL:
4358    case FS_OPCODE_TXB_LOGICAL: {
4359       /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
4360        * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
4361        * mode because the message exceeds the maximum length of 11.
4362        */
4363       const fs_reg &shadow_c = inst->src[1];
4364       if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
4365          return 16;
4366       else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
4367          return 8;
4368       else
4369          return inst->exec_size;
4370    }
4371    case SHADER_OPCODE_TXF_LOGICAL:
4372    case SHADER_OPCODE_TXS_LOGICAL:
4373       /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4374        * messages.  Use SIMD16 instead.
4375        */
4376       if (devinfo->gen == 4)
4377          return 16;
4378       else
4379          return inst->exec_size;
4380
4381    case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
4382       /* This opcode can take up to 6 arguments which means that in some
4383        * circumstances it can end up with a message that is too long in SIMD16
4384        * mode.
4385        */
4386       const unsigned coord_components = inst->src[8].ud;
4387       /* First three arguments are the sample index and the two arguments for
4388        * the MCS data.
4389        */
4390       if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
4391          return 8;
4392       else
4393          return inst->exec_size;
4394    }
4395
4396    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4397    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4398    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4399       return 8;
4400
4401    default:
4402       return inst->exec_size;
4403    }
4404 }
4405
4406 /**
4407  * The \p rows array of registers represents a \p num_rows by \p num_columns
4408  * matrix in row-major order, write it in column-major order into the register
4409  * passed as destination.  \p stride gives the separation between matrix
4410  * elements in the input in fs_builder::dispatch_width() units.
4411  */
4412 static void
4413 emit_transpose(const fs_builder &bld,
4414                const fs_reg &dst, const fs_reg *rows,
4415                unsigned num_rows, unsigned num_columns, unsigned stride)
4416 {
4417    fs_reg *const components = new fs_reg[num_rows * num_columns];
4418
4419    for (unsigned i = 0; i < num_columns; ++i) {
4420       for (unsigned j = 0; j < num_rows; ++j)
4421          components[num_rows * i + j] = offset(rows[j], bld, stride * i);
4422    }
4423
4424    bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
4425
4426    delete[] components;
4427 }
4428
4429 bool
4430 fs_visitor::lower_simd_width()
4431 {
4432    bool progress = false;
4433
4434    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4435       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
4436
4437       if (lower_width != inst->exec_size) {
4438          /* Builder matching the original instruction.  We may also need to
4439           * emit an instruction of width larger than the original, set the
4440           * execution size of the builder to the highest of both for now so
4441           * we're sure that both cases can be handled.
4442           */
4443          const fs_builder ibld = bld.at(block, inst)
4444                                     .exec_all(inst->force_writemask_all)
4445                                     .group(MAX2(inst->exec_size, lower_width),
4446                                            inst->force_sechalf);
4447
4448          /* Split the copies in chunks of the execution width of either the
4449           * original or the lowered instruction, whichever is lower.
4450           */
4451          const unsigned copy_width = MIN2(lower_width, inst->exec_size);
4452          const unsigned n = inst->exec_size / copy_width;
4453          const unsigned dst_size = inst->regs_written * REG_SIZE /
4454             inst->dst.component_size(inst->exec_size);
4455          fs_reg dsts[4];
4456
4457          assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
4458                 !inst->writes_accumulator && !inst->mlen);
4459
4460          for (unsigned i = 0; i < n; i++) {
4461             /* Emit a copy of the original instruction with the lowered width.
4462              * If the EOT flag was set throw it away except for the last
4463              * instruction to avoid killing the thread prematurely.
4464              */
4465             fs_inst split_inst = *inst;
4466             split_inst.exec_size = lower_width;
4467             split_inst.eot = inst->eot && i == n - 1;
4468
4469             /* Select the correct channel enables for the i-th group, then
4470              * transform the sources and destination and emit the lowered
4471              * instruction.
4472              */
4473             const fs_builder lbld = ibld.group(lower_width, i);
4474
4475             for (unsigned j = 0; j < inst->sources; j++) {
4476                if (inst->src[j].file != BAD_FILE &&
4477                    !is_uniform(inst->src[j])) {
4478                   /* Get the i-th copy_width-wide chunk of the source. */
4479                   const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
4480                   const unsigned src_size = inst->components_read(j);
4481
4482                   /* Use a trivial transposition to copy one every n
4483                    * copy_width-wide components of the register into a
4484                    * temporary passed as source to the lowered instruction.
4485                    */
4486                   split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
4487                   emit_transpose(lbld.group(copy_width, 0),
4488                                  split_inst.src[j], &src, 1, src_size, n);
4489                }
4490             }
4491
4492             if (inst->regs_written) {
4493                /* Allocate enough space to hold the result of the lowered
4494                 * instruction and fix up the number of registers written.
4495                 */
4496                split_inst.dst = dsts[i] =
4497                   lbld.vgrf(inst->dst.type, dst_size);
4498                split_inst.regs_written =
4499                   DIV_ROUND_UP(inst->regs_written * lower_width,
4500                                inst->exec_size);
4501             }
4502
4503             lbld.emit(split_inst);
4504          }
4505
4506          if (inst->regs_written) {
4507             /* Distance between useful channels in the temporaries, skipping
4508              * garbage if the lowered instruction is wider than the original.
4509              */
4510             const unsigned m = lower_width / copy_width;
4511
4512             /* Interleave the components of the result from the lowered
4513              * instructions.  We need to set exec_all() when copying more than
4514              * one half per component, because LOAD_PAYLOAD (in terms of which
4515              * emit_transpose is implemented) can only use the same channel
4516              * enable signals for all of its non-header sources.
4517              */
4518             emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
4519                                .group(copy_width, 0),
4520                            inst->dst, dsts, n, dst_size, m);
4521          }
4522
4523          inst->remove(block);
4524          progress = true;
4525       }
4526    }
4527
4528    if (progress)
4529       invalidate_live_intervals();
4530
4531    return progress;
4532 }
4533
4534 void
4535 fs_visitor::dump_instructions()
4536 {
4537    dump_instructions(NULL);
4538 }
4539
4540 void
4541 fs_visitor::dump_instructions(const char *name)
4542 {
4543    FILE *file = stderr;
4544    if (name && geteuid() != 0) {
4545       file = fopen(name, "w");
4546       if (!file)
4547          file = stderr;
4548    }
4549
4550    if (cfg) {
4551       calculate_register_pressure();
4552       int ip = 0, max_pressure = 0;
4553       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
4554          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
4555          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
4556          dump_instruction(inst, file);
4557          ip++;
4558       }
4559       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
4560    } else {
4561       int ip = 0;
4562       foreach_in_list(backend_instruction, inst, &instructions) {
4563          fprintf(file, "%4d: ", ip++);
4564          dump_instruction(inst, file);
4565       }
4566    }
4567
4568    if (file != stderr) {
4569       fclose(file);
4570    }
4571 }
4572
4573 void
4574 fs_visitor::dump_instruction(backend_instruction *be_inst)
4575 {
4576    dump_instruction(be_inst, stderr);
4577 }
4578
4579 void
4580 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
4581 {
4582    fs_inst *inst = (fs_inst *)be_inst;
4583
4584    if (inst->predicate) {
4585       fprintf(file, "(%cf0.%d) ",
4586              inst->predicate_inverse ? '-' : '+',
4587              inst->flag_subreg);
4588    }
4589
4590    fprintf(file, "%s", brw_instruction_name(inst->opcode));
4591    if (inst->saturate)
4592       fprintf(file, ".sat");
4593    if (inst->conditional_mod) {
4594       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
4595       if (!inst->predicate &&
4596           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
4597                               inst->opcode != BRW_OPCODE_IF &&
4598                               inst->opcode != BRW_OPCODE_WHILE))) {
4599          fprintf(file, ".f0.%d", inst->flag_subreg);
4600       }
4601    }
4602    fprintf(file, "(%d) ", inst->exec_size);
4603
4604    if (inst->mlen) {
4605       fprintf(file, "(mlen: %d) ", inst->mlen);
4606    }
4607
4608    switch (inst->dst.file) {
4609    case VGRF:
4610       fprintf(file, "vgrf%d", inst->dst.nr);
4611       if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
4612           inst->dst.subreg_offset)
4613          fprintf(file, "+%d.%d",
4614                  inst->dst.reg_offset, inst->dst.subreg_offset);
4615       break;
4616    case FIXED_GRF:
4617       fprintf(file, "g%d", inst->dst.nr);
4618       break;
4619    case MRF:
4620       fprintf(file, "m%d", inst->dst.nr);
4621       break;
4622    case BAD_FILE:
4623       fprintf(file, "(null)");
4624       break;
4625    case UNIFORM:
4626       fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset);
4627       break;
4628    case ATTR:
4629       fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset);
4630       break;
4631    case ARF:
4632       switch (inst->dst.nr) {
4633       case BRW_ARF_NULL:
4634          fprintf(file, "null");
4635          break;
4636       case BRW_ARF_ADDRESS:
4637          fprintf(file, "a0.%d", inst->dst.subnr);
4638          break;
4639       case BRW_ARF_ACCUMULATOR:
4640          fprintf(file, "acc%d", inst->dst.subnr);
4641          break;
4642       case BRW_ARF_FLAG:
4643          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4644          break;
4645       default:
4646          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4647          break;
4648       }
4649       if (inst->dst.subnr)
4650          fprintf(file, "+%d", inst->dst.subnr);
4651       break;
4652    case IMM:
4653       unreachable("not reached");
4654    }
4655    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
4656
4657    for (int i = 0; i < inst->sources; i++) {
4658       if (inst->src[i].negate)
4659          fprintf(file, "-");
4660       if (inst->src[i].abs)
4661          fprintf(file, "|");
4662       switch (inst->src[i].file) {
4663       case VGRF:
4664          fprintf(file, "vgrf%d", inst->src[i].nr);
4665          if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) ||
4666              inst->src[i].subreg_offset)
4667             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4668                     inst->src[i].subreg_offset);
4669          break;
4670       case FIXED_GRF:
4671          fprintf(file, "g%d", inst->src[i].nr);
4672          break;
4673       case MRF:
4674          fprintf(file, "***m%d***", inst->src[i].nr);
4675          break;
4676       case ATTR:
4677          fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset);
4678          break;
4679       case UNIFORM:
4680          fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
4681          if (inst->src[i].reladdr) {
4682             fprintf(file, "+reladdr");
4683          } else if (inst->src[i].subreg_offset) {
4684             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4685                     inst->src[i].subreg_offset);
4686          }
4687          break;
4688       case BAD_FILE:
4689          fprintf(file, "(null)");
4690          break;
4691       case IMM:
4692          switch (inst->src[i].type) {
4693          case BRW_REGISTER_TYPE_F:
4694             fprintf(file, "%ff", inst->src[i].f);
4695             break;
4696          case BRW_REGISTER_TYPE_W:
4697          case BRW_REGISTER_TYPE_D:
4698             fprintf(file, "%dd", inst->src[i].d);
4699             break;
4700          case BRW_REGISTER_TYPE_UW:
4701          case BRW_REGISTER_TYPE_UD:
4702             fprintf(file, "%uu", inst->src[i].ud);
4703             break;
4704          case BRW_REGISTER_TYPE_VF:
4705             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
4706                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
4707                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
4708                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
4709                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
4710             break;
4711          default:
4712             fprintf(file, "???");
4713             break;
4714          }
4715          break;
4716       case ARF:
4717          switch (inst->src[i].nr) {
4718          case BRW_ARF_NULL:
4719             fprintf(file, "null");
4720             break;
4721          case BRW_ARF_ADDRESS:
4722             fprintf(file, "a0.%d", inst->src[i].subnr);
4723             break;
4724          case BRW_ARF_ACCUMULATOR:
4725             fprintf(file, "acc%d", inst->src[i].subnr);
4726             break;
4727          case BRW_ARF_FLAG:
4728             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4729             break;
4730          default:
4731             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4732             break;
4733          }
4734          if (inst->src[i].subnr)
4735             fprintf(file, "+%d", inst->src[i].subnr);
4736          break;
4737       }
4738       if (inst->src[i].abs)
4739          fprintf(file, "|");
4740
4741       if (inst->src[i].file != IMM) {
4742          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
4743       }
4744
4745       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
4746          fprintf(file, ", ");
4747    }
4748
4749    fprintf(file, " ");
4750
4751    if (inst->force_writemask_all)
4752       fprintf(file, "NoMask ");
4753
4754    if (dispatch_width == 16 && inst->exec_size == 8) {
4755       if (inst->force_sechalf)
4756          fprintf(file, "2ndhalf ");
4757       else
4758          fprintf(file, "1sthalf ");
4759    }
4760
4761    fprintf(file, "\n");
4762 }
4763
4764 /**
4765  * Possibly returns an instruction that set up @param reg.
4766  *
4767  * Sometimes we want to take the result of some expression/variable
4768  * dereference tree and rewrite the instruction generating the result
4769  * of the tree.  When processing the tree, we know that the
4770  * instructions generated are all writing temporaries that are dead
4771  * outside of this tree.  So, if we have some instructions that write
4772  * a temporary, we're free to point that temp write somewhere else.
4773  *
4774  * Note that this doesn't guarantee that the instruction generated
4775  * only reg -- it might be the size=4 destination of a texture instruction.
4776  */
4777 fs_inst *
4778 fs_visitor::get_instruction_generating_reg(fs_inst *start,
4779                                            fs_inst *end,
4780                                            const fs_reg &reg)
4781 {
4782    if (end == start ||
4783        end->is_partial_write() ||
4784        reg.reladdr ||
4785        !reg.equals(end->dst)) {
4786       return NULL;
4787    } else {
4788       return end;
4789    }
4790 }
4791
4792 void
4793 fs_visitor::setup_payload_gen6()
4794 {
4795    bool uses_depth =
4796       (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
4797    unsigned barycentric_interp_modes =
4798       (stage == MESA_SHADER_FRAGMENT) ?
4799       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
4800
4801    assert(devinfo->gen >= 6);
4802
4803    /* R0-1: masks, pixel X/Y coordinates. */
4804    payload.num_regs = 2;
4805    /* R2: only for 32-pixel dispatch.*/
4806
4807    /* R3-26: barycentric interpolation coordinates.  These appear in the
4808     * same order that they appear in the brw_wm_barycentric_interp_mode
4809     * enum.  Each set of coordinates occupies 2 registers if dispatch width
4810     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
4811     * appear if they were enabled using the "Barycentric Interpolation
4812     * Mode" bits in WM_STATE.
4813     */
4814    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
4815       if (barycentric_interp_modes & (1 << i)) {
4816          payload.barycentric_coord_reg[i] = payload.num_regs;
4817          payload.num_regs += 2;
4818          if (dispatch_width == 16) {
4819             payload.num_regs += 2;
4820          }
4821       }
4822    }
4823
4824    /* R27: interpolated depth if uses source depth */
4825    if (uses_depth) {
4826       payload.source_depth_reg = payload.num_regs;
4827       payload.num_regs++;
4828       if (dispatch_width == 16) {
4829          /* R28: interpolated depth if not SIMD8. */
4830          payload.num_regs++;
4831       }
4832    }
4833    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
4834    if (uses_depth) {
4835       payload.source_w_reg = payload.num_regs;
4836       payload.num_regs++;
4837       if (dispatch_width == 16) {
4838          /* R30: interpolated W if not SIMD8. */
4839          payload.num_regs++;
4840       }
4841    }
4842
4843    if (stage == MESA_SHADER_FRAGMENT) {
4844       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4845       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4846       prog_data->uses_pos_offset = key->compute_pos_offset;
4847       /* R31: MSAA position offsets. */
4848       if (prog_data->uses_pos_offset) {
4849          payload.sample_pos_reg = payload.num_regs;
4850          payload.num_regs++;
4851       }
4852    }
4853
4854    /* R32: MSAA input coverage mask */
4855    if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
4856       assert(devinfo->gen >= 7);
4857       payload.sample_mask_in_reg = payload.num_regs;
4858       payload.num_regs++;
4859       if (dispatch_width == 16) {
4860          /* R33: input coverage mask if not SIMD8. */
4861          payload.num_regs++;
4862       }
4863    }
4864
4865    /* R34-: bary for 32-pixel. */
4866    /* R58-59: interp W for 32-pixel. */
4867
4868    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4869       source_depth_to_render_target = true;
4870    }
4871 }
4872
4873 void
4874 fs_visitor::setup_vs_payload()
4875 {
4876    /* R0: thread header, R1: urb handles */
4877    payload.num_regs = 2;
4878 }
4879
/*
 * NOTE: The text below describes how the compute shader local invocation ID
 * push constant data is laid out.  It is unrelated to
 * fs_visitor::setup_gs_payload(), which it happens to precede.
 *
 * We are building the local ID push constant data using the simplest possible
 * method. We simply push the local IDs directly as they should appear in the
 * registers for the uvec3 gl_LocalInvocationID variable.
 *
 * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
 * registers worth of push constant space.
 *
 * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
 * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
 * to be coordinated.
 *
 * FINISHME: There are a few easy optimizations to consider.
 *
 * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
 *    no need for using push constant space for that dimension.
 *
 * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
 *    easily use 16-bit words rather than 32-bit dwords in the push constant
 *    data.
 *
 * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
 *    conveying the data, and thereby reduce push constant usage.
 *
 */
4905 void
4906 fs_visitor::setup_gs_payload()
4907 {
4908    assert(stage == MESA_SHADER_GEOMETRY);
4909
4910    struct brw_gs_prog_data *gs_prog_data =
4911       (struct brw_gs_prog_data *) prog_data;
4912    struct brw_vue_prog_data *vue_prog_data =
4913       (struct brw_vue_prog_data *) prog_data;
4914
4915    /* R0: thread header, R1: output URB handles */
4916    payload.num_regs = 2;
4917
4918    if (gs_prog_data->include_primitive_id) {
4919       /* R2: Primitive ID 0..7 */
4920       payload.num_regs++;
4921    }
4922
4923    /* Use a maximum of 32 registers for push-model inputs. */
4924    const unsigned max_push_components = 32;
4925
4926    /* If pushing our inputs would take too many registers, reduce the URB read
4927     * length (which is in HWords, or 8 registers), and resort to pulling.
4928     *
4929     * Note that the GS reads <URB Read Length> HWords for every vertex - so we
4930     * have to multiply by VerticesIn to obtain the total storage requirement.
4931     */
4932    if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
4933        max_push_components) {
4934       gs_prog_data->base.include_vue_handles = true;
4935
4936       /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
4937       payload.num_regs += nir->info.gs.vertices_in;
4938
4939       vue_prog_data->urb_read_length =
4940          ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
4941    }
4942 }
4943
4944 void
4945 fs_visitor::setup_cs_payload()
4946 {
4947    assert(devinfo->gen >= 7);
4948    brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
4949
4950    payload.num_regs = 1;
4951
4952    if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
4953       prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
4954       payload.local_invocation_id_reg = payload.num_regs;
4955       payload.num_regs += prog_data->local_invocation_id_regs;
4956    }
4957 }
4958
4959 void
4960 fs_visitor::calculate_register_pressure()
4961 {
4962    invalidate_live_intervals();
4963    calculate_live_intervals();
4964
4965    unsigned num_instructions = 0;
4966    foreach_block(block, cfg)
4967       num_instructions += block->instructions.length();
4968
4969    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4970
4971    for (unsigned reg = 0; reg < alloc.count; reg++) {
4972       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4973          regs_live_at_ip[ip] += alloc.sizes[reg];
4974    }
4975 }
4976
4977 void
4978 fs_visitor::optimize()
4979 {
4980    /* Start by validating the shader we currently have. */
4981    validate();
4982
4983    /* bld is the common builder object pointing at the end of the program we
4984     * used to translate it into i965 IR.  For the optimization and lowering
4985     * passes coming next, any code added after the end of the program without
4986     * having explicitly called fs_builder::at() clearly points at a mistake.
4987     * Ideally optimization passes wouldn't be part of the visitor so they
4988     * wouldn't have access to bld at all, but they do, so just in case some
4989     * pass forgets to ask for a location explicitly set it to NULL here to
4990     * make it trip.  The dispatch width is initialized to a bogus value to
4991     * make sure that optimizations set the execution controls explicitly to
4992     * match the code they are manipulating instead of relying on the defaults.
4993     */
4994    bld = fs_builder(this, 64);
4995
4996    assign_constant_locations();
4997    demote_pull_constants();
4998
4999    validate();
5000
5001    split_virtual_grfs();
5002    validate();
5003
5004 #define OPT(pass, args...) ({                                           \
5005       pass_num++;                                                       \
5006       bool this_progress = pass(args);                                  \
5007                                                                         \
5008       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
5009          char filename[64];                                             \
5010          snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
5011                   stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
5012                                                                         \
5013          backend_shader::dump_instructions(filename);                   \
5014       }                                                                 \
5015                                                                         \
5016       validate();                                                       \
5017                                                                         \
5018       progress = progress || this_progress;                             \
5019       this_progress;                                                    \
5020    })
5021
5022    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
5023       char filename[64];
5024       snprintf(filename, 64, "%s%d-%s-00-start",
5025                stage_abbrev, dispatch_width, nir->info.name);
5026
5027       backend_shader::dump_instructions(filename);
5028    }
5029
5030    bool progress = false;
5031    int iteration = 0;
5032    int pass_num = 0;
5033
5034    OPT(lower_simd_width);
5035    OPT(lower_logical_sends);
5036
5037    do {
5038       progress = false;
5039       pass_num = 0;
5040       iteration++;
5041
5042       OPT(remove_duplicate_mrf_writes);
5043
5044       OPT(opt_algebraic);
5045       OPT(opt_cse);
5046       OPT(opt_copy_propagate);
5047       OPT(opt_predicated_break, this);
5048       OPT(opt_cmod_propagation);
5049       OPT(dead_code_eliminate);
5050       OPT(opt_peephole_sel);
5051       OPT(dead_control_flow_eliminate, this);
5052       OPT(opt_register_renaming);
5053       OPT(opt_redundant_discard_jumps);
5054       OPT(opt_saturate_propagation);
5055       OPT(opt_zero_samples);
5056       OPT(register_coalesce);
5057       OPT(compute_to_mrf);
5058       OPT(eliminate_find_live_channel);
5059
5060       OPT(compact_virtual_grfs);
5061    } while (progress);
5062
5063    pass_num = 0;
5064
5065    OPT(opt_sampler_eot);
5066
5067    if (OPT(lower_load_payload)) {
5068       split_virtual_grfs();
5069       OPT(register_coalesce);
5070       OPT(compute_to_mrf);
5071       OPT(dead_code_eliminate);
5072    }
5073
5074    OPT(opt_combine_constants);
5075    OPT(lower_integer_multiplication);
5076
5077    lower_uniform_pull_constant_loads();
5078
5079    validate();
5080 }
5081
5082 /**
5083  * Three source instruction must have a GRF/MRF destination register.
5084  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
5085  */
5086 void
5087 fs_visitor::fixup_3src_null_dest()
5088 {
5089    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
5090       if (inst->is_3src() && inst->dst.is_null()) {
5091          inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5092                             inst->dst.type);
5093       }
5094    }
5095 }
5096
5097 void
5098 fs_visitor::allocate_registers()
5099 {
5100    bool allocated_without_spills;
5101
5102    static const enum instruction_scheduler_mode pre_modes[] = {
5103       SCHEDULE_PRE,
5104       SCHEDULE_PRE_NON_LIFO,
5105       SCHEDULE_PRE_LIFO,
5106    };
5107
5108    /* Try each scheduling heuristic to see if it can successfully register
5109     * allocate without spilling.  They should be ordered by decreasing
5110     * performance but increasing likelihood of allocating.
5111     */
5112    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5113       schedule_instructions(pre_modes[i]);
5114
5115       if (0) {
5116          assign_regs_trivial();
5117          allocated_without_spills = true;
5118       } else {
5119          allocated_without_spills = assign_regs(false);
5120       }
5121       if (allocated_without_spills)
5122          break;
5123    }
5124
5125    if (!allocated_without_spills) {
5126       /* We assume that any spilling is worse than just dropping back to
5127        * SIMD8.  There's probably actually some intermediate point where
5128        * SIMD16 with a couple of spills is still better.
5129        */
5130       if (dispatch_width == 16) {
5131          fail("Failure to register allocate.  Reduce number of "
5132               "live scalar values to avoid this.");
5133       } else {
5134          compiler->shader_perf_log(log_data,
5135                                    "%s shader triggered register spilling.  "
5136                                    "Try reducing the number of live scalar "
5137                                    "values to improve performance.\n",
5138                                    stage_name);
5139       }
5140
5141       /* Since we're out of heuristics, just go spill registers until we
5142        * get an allocation.
5143        */
5144       while (!assign_regs(true)) {
5145          if (failed)
5146             break;
5147       }
5148    }
5149
5150    /* This must come after all optimization and register allocation, since
5151     * it inserts dead code that happens to have side effects, and it does
5152     * so based on the actual physical registers in use.
5153     */
5154    insert_gen4_send_dependency_workarounds();
5155
5156    if (failed)
5157       return;
5158
5159    schedule_instructions(SCHEDULE_POST);
5160
5161    if (last_scratch > 0)
5162       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
5163 }
5164
5165 bool
5166 fs_visitor::run_vs(gl_clip_plane *clip_planes)
5167 {
5168    assert(stage == MESA_SHADER_VERTEX);
5169
5170    setup_vs_payload();
5171
5172    if (shader_time_index >= 0)
5173       emit_shader_time_begin();
5174
5175    emit_nir_code();
5176
5177    if (failed)
5178       return false;
5179
5180    compute_clip_distance(clip_planes);
5181
5182    emit_urb_writes();
5183
5184    if (shader_time_index >= 0)
5185       emit_shader_time_end();
5186
5187    calculate_cfg();
5188
5189    optimize();
5190
5191    assign_curb_setup();
5192    assign_vs_urb_setup();
5193
5194    fixup_3src_null_dest();
5195    allocate_registers();
5196
5197    return !failed;
5198 }
5199
5200 bool
5201 fs_visitor::run_gs()
5202 {
5203    assert(stage == MESA_SHADER_GEOMETRY);
5204
5205    setup_gs_payload();
5206
5207    this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
5208
5209    if (gs_compile->control_data_header_size_bits > 0) {
5210       /* Create a VGRF to store accumulated control data bits. */
5211       this->control_data_bits = vgrf(glsl_type::uint_type);
5212
5213       /* If we're outputting more than 32 control data bits, then EmitVertex()
5214        * will set control_data_bits to 0 after emitting the first vertex.
5215        * Otherwise, we need to initialize it to 0 here.
5216        */
5217       if (gs_compile->control_data_header_size_bits <= 32) {
5218          const fs_builder abld = bld.annotate("initialize control data bits");
5219          abld.MOV(this->control_data_bits, fs_reg(0u));
5220       }
5221    }
5222
5223    if (shader_time_index >= 0)
5224       emit_shader_time_begin();
5225
5226    emit_nir_code();
5227
5228    emit_gs_thread_end();
5229
5230    if (shader_time_index >= 0)
5231       emit_shader_time_end();
5232
5233    if (failed)
5234       return false;
5235
5236    calculate_cfg();
5237
5238    optimize();
5239
5240    assign_curb_setup();
5241    assign_gs_urb_setup();
5242
5243    fixup_3src_null_dest();
5244    allocate_registers();
5245
5246    return !failed;
5247 }
5248
5249 bool
5250 fs_visitor::run_fs(bool do_rep_send)
5251 {
5252    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
5253    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
5254
5255    assert(stage == MESA_SHADER_FRAGMENT);
5256
5257    if (devinfo->gen >= 6)
5258       setup_payload_gen6();
5259    else
5260       setup_payload_gen4();
5261
5262    if (0) {
5263       emit_dummy_fs();
5264    } else if (do_rep_send) {
5265       assert(dispatch_width == 16);
5266       emit_repclear_shader();
5267    } else {
5268       if (shader_time_index >= 0)
5269          emit_shader_time_begin();
5270
5271       calculate_urb_setup();
5272       if (nir->info.inputs_read > 0) {
5273          if (devinfo->gen < 6)
5274             emit_interpolation_setup_gen4();
5275          else
5276             emit_interpolation_setup_gen6();
5277       }
5278
5279       /* We handle discards by keeping track of the still-live pixels in f0.1.
5280        * Initialize it with the dispatched pixels.
5281        */
5282       if (wm_prog_data->uses_kill) {
5283          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
5284          discard_init->flag_subreg = 1;
5285       }
5286
5287       /* Generate FS IR for main().  (the visitor only descends into
5288        * functions called "main").
5289        */
5290       emit_nir_code();
5291
5292       if (failed)
5293          return false;
5294
5295       if (wm_prog_data->uses_kill)
5296          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
5297
5298       if (wm_key->alpha_test_func)
5299          emit_alpha_test();
5300
5301       emit_fb_writes();
5302
5303       if (shader_time_index >= 0)
5304          emit_shader_time_end();
5305
5306       calculate_cfg();
5307
5308       optimize();
5309
5310       assign_curb_setup();
5311       assign_urb_setup();
5312
5313       fixup_3src_null_dest();
5314       allocate_registers();
5315
5316       if (failed)
5317          return false;
5318    }
5319
5320    if (dispatch_width == 8)
5321       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
5322    else
5323       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
5324
5325    return !failed;
5326 }
5327
5328 bool
5329 fs_visitor::run_cs()
5330 {
5331    assert(stage == MESA_SHADER_COMPUTE);
5332
5333    setup_cs_payload();
5334
5335    if (shader_time_index >= 0)
5336       emit_shader_time_begin();
5337
5338    emit_nir_code();
5339
5340    if (failed)
5341       return false;
5342
5343    emit_cs_terminate();
5344
5345    if (shader_time_index >= 0)
5346       emit_shader_time_end();
5347
5348    calculate_cfg();
5349
5350    optimize();
5351
5352    assign_curb_setup();
5353
5354    fixup_3src_null_dest();
5355    allocate_registers();
5356
5357    if (failed)
5358       return false;
5359
5360    return !failed;
5361 }
5362
5363 /**
5364  * Return a bitfield where bit n is set if barycentric interpolation mode n
5365  * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
5366  */
5367 static unsigned
5368 brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
5369                                      bool shade_model_flat,
5370                                      bool persample_shading,
5371                                      const nir_shader *shader)
5372 {
5373    unsigned barycentric_interp_modes = 0;
5374
5375    nir_foreach_variable(var, &shader->inputs) {
5376       enum glsl_interp_qualifier interp_qualifier =
5377          (enum glsl_interp_qualifier)var->data.interpolation;
5378       bool is_centroid = var->data.centroid && !persample_shading;
5379       bool is_sample = var->data.sample || persample_shading;
5380       bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
5381                          (var->data.location == VARYING_SLOT_COL1);
5382
5383       /* Ignore WPOS and FACE, because they don't require interpolation. */
5384       if (var->data.location == VARYING_SLOT_POS ||
5385           var->data.location == VARYING_SLOT_FACE)
5386          continue;
5387
5388       /* Determine the set (or sets) of barycentric coordinates needed to
5389        * interpolate this variable.  Note that when
5390        * brw->needs_unlit_centroid_workaround is set, centroid interpolation
5391        * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
5392        * for lit pixels, so we need both sets of barycentric coordinates.
5393        */
5394       if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
5395          if (is_centroid) {
5396             barycentric_interp_modes |=
5397                1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
5398          } else if (is_sample) {
5399             barycentric_interp_modes |=
5400                1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
5401          }
5402          if ((!is_centroid && !is_sample) ||
5403              devinfo->needs_unlit_centroid_workaround) {
5404             barycentric_interp_modes |=
5405                1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
5406          }
5407       } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
5408                  (!(shade_model_flat && is_gl_Color) &&
5409                   interp_qualifier == INTERP_QUALIFIER_NONE)) {
5410          if (is_centroid) {
5411             barycentric_interp_modes |=
5412                1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
5413          } else if (is_sample) {
5414             barycentric_interp_modes |=
5415                1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
5416          }
5417          if ((!is_centroid && !is_sample) ||
5418              devinfo->needs_unlit_centroid_workaround) {
5419             barycentric_interp_modes |=
5420                1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
5421          }
5422       }
5423    }
5424
5425    return barycentric_interp_modes;
5426 }
5427
5428 static uint8_t
5429 computed_depth_mode(const nir_shader *shader)
5430 {
5431    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
5432       switch (shader->info.fs.depth_layout) {
5433       case FRAG_DEPTH_LAYOUT_NONE:
5434       case FRAG_DEPTH_LAYOUT_ANY:
5435          return BRW_PSCDEPTH_ON;
5436       case FRAG_DEPTH_LAYOUT_GREATER:
5437          return BRW_PSCDEPTH_ON_GE;
5438       case FRAG_DEPTH_LAYOUT_LESS:
5439          return BRW_PSCDEPTH_ON_LE;
5440       case FRAG_DEPTH_LAYOUT_UNCHANGED:
5441          return BRW_PSCDEPTH_OFF;
5442       }
5443    }
5444    return BRW_PSCDEPTH_OFF;
5445 }
5446
5447 const unsigned *
5448 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
5449                void *mem_ctx,
5450                const struct brw_wm_prog_key *key,
5451                struct brw_wm_prog_data *prog_data,
5452                const nir_shader *shader,
5453                struct gl_program *prog,
5454                int shader_time_index8, int shader_time_index16,
5455                bool use_rep_send,
5456                unsigned *final_assembly_size,
5457                char **error_str)
5458 {
5459    /* key->alpha_test_func means simulating alpha testing via discards,
5460     * so the shader definitely kills pixels.
5461     */
5462    prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
5463    prog_data->uses_omask =
5464       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
5465    prog_data->computed_depth_mode = computed_depth_mode(shader);
5466    prog_data->computed_stencil =
5467       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
5468
5469    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
5470
5471    prog_data->barycentric_interp_modes =
5472       brw_compute_barycentric_interp_modes(compiler->devinfo,
5473                                            key->flat_shade,
5474                                            key->persample_shading,
5475                                            shader);
5476
5477    fs_visitor v(compiler, log_data, mem_ctx, key,
5478                 &prog_data->base, prog, shader, 8,
5479                 shader_time_index8);
5480    if (!v.run_fs(false /* do_rep_send */)) {
5481       if (error_str)
5482          *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
5483
5484       return NULL;
5485    }
5486
5487    cfg_t *simd16_cfg = NULL;
5488    fs_visitor v2(compiler, log_data, mem_ctx, key,
5489                  &prog_data->base, prog, shader, 16,
5490                  shader_time_index16);
5491    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
5492       if (!v.simd16_unsupported) {
5493          /* Try a SIMD16 compile */
5494          v2.import_uniforms(&v);
5495          if (!v2.run_fs(use_rep_send)) {
5496             compiler->shader_perf_log(log_data,
5497                                       "SIMD16 shader failed to compile: %s",
5498                                       v2.fail_msg);
5499          } else {
5500             simd16_cfg = v2.cfg;
5501          }
5502       }
5503    }
5504
5505    cfg_t *simd8_cfg;
5506    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
5507    if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
5508       simd8_cfg = NULL;
5509       prog_data->no_8 = true;
5510    } else {
5511       simd8_cfg = v.cfg;
5512       prog_data->no_8 = false;
5513    }
5514
5515    fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
5516                   v.promoted_constants, v.runtime_check_aads_emit, "FS");
5517
5518    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
5519       g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
5520                                      shader->info.label ? shader->info.label :
5521                                                           "unnamed",
5522                                      shader->info.name));
5523    }
5524
5525    if (simd8_cfg)
5526       g.generate_code(simd8_cfg, 8);
5527    if (simd16_cfg)
5528       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
5529
5530    return g.get_assembly(final_assembly_size);
5531 }
5532
5533 void
5534 brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
5535                              void *buffer, uint32_t threads, uint32_t stride)
5536 {
5537    if (prog_data->local_invocation_id_regs == 0)
5538       return;
5539
5540    /* 'stride' should be an integer number of registers, that is, a multiple
5541     * of 32 bytes.
5542     */
5543    assert(stride % 32 == 0);
5544
5545    unsigned x = 0, y = 0, z = 0;
5546    for (unsigned t = 0; t < threads; t++) {
5547       uint32_t *param = (uint32_t *) buffer + stride * t / 4;
5548
5549       for (unsigned i = 0; i < prog_data->simd_size; i++) {
5550          param[0 * prog_data->simd_size + i] = x;
5551          param[1 * prog_data->simd_size + i] = y;
5552          param[2 * prog_data->simd_size + i] = z;
5553
5554          x++;
5555          if (x == prog_data->local_size[0]) {
5556             x = 0;
5557             y++;
5558             if (y == prog_data->local_size[1]) {
5559                y = 0;
5560                z++;
5561                if (z == prog_data->local_size[2])
5562                   z = 0;
5563             }
5564          }
5565       }
5566    }
5567 }
5568
5569 fs_reg *
5570 fs_visitor::emit_cs_local_invocation_id_setup()
5571 {
5572    assert(stage == MESA_SHADER_COMPUTE);
5573
5574    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5575
5576    struct brw_reg src =
5577       brw_vec8_grf(payload.local_invocation_id_reg, 0);
5578    src = retype(src, BRW_REGISTER_TYPE_UD);
5579    bld.MOV(*reg, src);
5580    src.nr += dispatch_width / 8;
5581    bld.MOV(offset(*reg, bld, 1), src);
5582    src.nr += dispatch_width / 8;
5583    bld.MOV(offset(*reg, bld, 2), src);
5584
5585    return reg;
5586 }
5587
5588 fs_reg *
5589 fs_visitor::emit_cs_work_group_id_setup()
5590 {
5591    assert(stage == MESA_SHADER_COMPUTE);
5592
5593    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5594
5595    struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
5596    struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
5597    struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
5598
5599    bld.MOV(*reg, r0_1);
5600    bld.MOV(offset(*reg, bld, 1), r0_6);
5601    bld.MOV(offset(*reg, bld, 2), r0_7);
5602
5603    return reg;
5604 }
5605
5606 const unsigned *
5607 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
5608                void *mem_ctx,
5609                const struct brw_cs_prog_key *key,
5610                struct brw_cs_prog_data *prog_data,
5611                const nir_shader *shader,
5612                int shader_time_index,
5613                unsigned *final_assembly_size,
5614                char **error_str)
5615 {
5616    prog_data->local_size[0] = shader->info.cs.local_size[0];
5617    prog_data->local_size[1] = shader->info.cs.local_size[1];
5618    prog_data->local_size[2] = shader->info.cs.local_size[2];
5619    unsigned local_workgroup_size =
5620       shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
5621       shader->info.cs.local_size[2];
5622
5623    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
5624
5625    cfg_t *cfg = NULL;
5626    const char *fail_msg = NULL;
5627
5628    /* Now the main event: Visit the shader IR and generate our CS IR for it.
5629     */
5630    fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
5631                  NULL, /* Never used in core profile */
5632                  shader, 8, shader_time_index);
5633    if (!v8.run_cs()) {
5634       fail_msg = v8.fail_msg;
5635    } else if (local_workgroup_size <= 8 * max_cs_threads) {
5636       cfg = v8.cfg;
5637       prog_data->simd_size = 8;
5638    }
5639
5640    fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
5641                  NULL, /* Never used in core profile */
5642                  shader, 16, shader_time_index);
5643    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
5644        !fail_msg && !v8.simd16_unsupported &&
5645        local_workgroup_size <= 16 * max_cs_threads) {
5646       /* Try a SIMD16 compile */
5647       v16.import_uniforms(&v8);
5648       if (!v16.run_cs()) {
5649          compiler->shader_perf_log(log_data,
5650                                    "SIMD16 shader failed to compile: %s",
5651                                    v16.fail_msg);
5652          if (!cfg) {
5653             fail_msg =
5654                "Couldn't generate SIMD16 program and not "
5655                "enough threads for SIMD8";
5656          }
5657       } else {
5658          cfg = v16.cfg;
5659          prog_data->simd_size = 16;
5660       }
5661    }
5662
5663    if (unlikely(cfg == NULL)) {
5664       assert(fail_msg);
5665       if (error_str)
5666          *error_str = ralloc_strdup(mem_ctx, fail_msg);
5667
5668       return NULL;
5669    }
5670
5671    fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
5672                   v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
5673    if (INTEL_DEBUG & DEBUG_CS) {
5674       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
5675                                    shader->info.label ? shader->info.label :
5676                                                         "unnamed",
5677                                    shader->info.name);
5678       g.enable_debug(name);
5679    }
5680
5681    g.generate_code(cfg, prog_data->simd_size);
5682
5683    return g.get_assembly(final_assembly_size);
5684 }