i965: Clean up #includes in the compiler.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 #include "brw_fs.h"
35 #include "brw_cs.h"
36 #include "brw_nir.h"
37 #include "brw_vec4_gs_visitor.h"
38 #include "brw_cfg.h"
39 #include "brw_program.h"
40 #include "brw_dead_control_flow.h"
41 #include "glsl/nir/glsl_types.h"
42
43 using namespace brw;
44
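/**
 * Shared constructor helper: zero the instruction, copy the sources into a
 * freshly allocated array (always at least three entries, since code such as
 * equals() inspects src[0..2] unconditionally), and derive regs_written from
 * the destination's per-channel size.
 */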
45 void
46 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
47 const fs_reg *src, unsigned sources)
48 {
49 memset(this, 0, sizeof(*this));
50
51 this->src = new fs_reg[MAX2(sources, 3)];
52 for (unsigned i = 0; i < sources; i++)
53 this->src[i] = src[i];
54
55 this->opcode = opcode;
56 this->dst = dst;
57 this->sources = sources;
58 this->exec_size = exec_size;
59
60 assert(dst.file != IMM && dst.file != UNIFORM);
61
62 assert(this->exec_size != 0);
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 switch (dst.file) {
68 case VGRF:
69 case ARF:
70 case FIXED_GRF:
71 case MRF:
72 case ATTR:
73 this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
74 REG_SIZE);
75 break;
76 case BAD_FILE:
77 this->regs_written = 0;
78 break;
79 case IMM:
80 case UNIFORM:
81 unreachable("Invalid destination register file");
82 }
83
84 this->writes_accumulator = false;
85 }
86
87 fs_inst::fs_inst()
88 {
89 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
90 }
91
92 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
93 {
94 init(opcode, exec_size, reg_undef, NULL, 0);
95 }
96
97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
98 {
99 init(opcode, exec_size, dst, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
103 const fs_reg &src0)
104 {
105 const fs_reg src[1] = { src0 };
106 init(opcode, exec_size, dst, src, 1);
107 }
108
109 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
110 const fs_reg &src0, const fs_reg &src1)
111 {
112 const fs_reg src[2] = { src0, src1 };
113 init(opcode, exec_size, dst, src, 2);
114 }
115
116 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
117 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
118 {
119 const fs_reg src[3] = { src0, src1, src2 };
120 init(opcode, exec_size, dst, src, 3);
121 }
122
123 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
124 const fs_reg src[], unsigned sources)
125 {
126 init(opcode, exec_width, dst, src, sources);
127 }
128
129 fs_inst::fs_inst(const fs_inst &that)
130 {
131 memcpy(this, &that, sizeof(that));
132
133 this->src = new fs_reg[MAX2(that.sources, 3)];
134
135 for (unsigned i = 0; i < that.sources; i++)
136 this->src[i] = that.src[i];
137 }
138
139 fs_inst::~fs_inst()
140 {
141 delete[] this->src;
142 }
143
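/**
 * Grow or shrink the source array to num_sources entries, preserving the
 * sources that remain in range.
 */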
144 void
145 fs_inst::resize_sources(uint8_t num_sources)
146 {
147 if (this->sources != num_sources) {
148 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
149
150 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
151 src[i] = this->src[i];
152
153 delete[] this->src;
154 this->src = src;
155 this->sources = num_sources;
156 }
157 }
158
159 void
160 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
161 const fs_reg &dst,
162 const fs_reg &surf_index,
163 const fs_reg &varying_offset,
164 uint32_t const_offset)
165 {
166 /* We have our constant surface use a pitch of 4 bytes, so our index can
167 * be any component of a vector, and then we load 4 contiguous
168 * components starting from that.
169 *
170 * We break the const_offset down into a portion added to the variable
171 * offset and a portion handled via reg_offset, which means that if you
172 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
173 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
174 * CSE can later notice that those loads are all the same and eliminate
175 * the redundant ones.
176 */
177 fs_reg vec4_offset = vgrf(glsl_type::int_type);
178 bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~3));
179
180 int scale = 1;
181 if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
182 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
183 * u, v, r) as parameters, or we can just use the SIMD16 message
184 * consisting of (header, u). We choose the second, at the cost of a
185 * longer return length.
186 */
187 scale = 2;
188 }
189
190 enum opcode op;
191 if (devinfo->gen >= 7)
192 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
193 else
194 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
195
196 int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
197 fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
198 fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
199 inst->regs_written = regs_written;
200
201 if (devinfo->gen < 7) {
202 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen);
203 inst->header_size = 1;
204 if (devinfo->gen == 4)
205 inst->mlen = 3;
206 else
207 inst->mlen = 1 + bld.dispatch_width() / 8;
208 }
209
210 bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
211 }
212
213 /**
214 * A helper for MOV generation for fixing up broken hardware SEND dependency
215 * handling.
216 */
217 void
218 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
219 {
220 /* The caller always wants the MOV emitted uncompressed, to add the minimal
221 * extra dependencies and to avoid having to align its registers to 2.
222 */
223 const fs_builder ubld = bld.annotate("send dependency resolve")
224 .half(0);
225
226 ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
227 }
228
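/**
 * Field-by-field comparison of two instructions' opcode, operands and send
 * parameters.
 */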
229 bool
230 fs_inst::equals(fs_inst *inst) const
231 {
232 return (opcode == inst->opcode &&
233 dst.equals(inst->dst) &&
234 src[0].equals(inst->src[0]) &&
235 src[1].equals(inst->src[1]) &&
236 src[2].equals(inst->src[2]) &&
237 saturate == inst->saturate &&
238 predicate == inst->predicate &&
239 conditional_mod == inst->conditional_mod &&
240 mlen == inst->mlen &&
241 base_mrf == inst->base_mrf &&
242 target == inst->target &&
243 eot == inst->eot &&
244 header_size == inst->header_size &&
245 shadow_compare == inst->shadow_compare &&
246 exec_size == inst->exec_size &&
247 offset == inst->offset);
248 }
249
250 bool
251 fs_inst::overwrites_reg(const fs_reg &reg) const
252 {
253 return reg.in_range(dst, regs_written);
254 }
255
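/**
 * Returns true if this instruction's message payload is sourced from GRFs
 * rather than being copied to MRFs before the send.
 */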
256 bool
257 fs_inst::is_send_from_grf() const
258 {
259 switch (opcode) {
260 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
261 case SHADER_OPCODE_SHADER_TIME_ADD:
262 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
263 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
264 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
265 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
266 case SHADER_OPCODE_UNTYPED_ATOMIC:
267 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
268 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
269 case SHADER_OPCODE_TYPED_ATOMIC:
270 case SHADER_OPCODE_TYPED_SURFACE_READ:
271 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
272 case SHADER_OPCODE_URB_WRITE_SIMD8:
273 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
274 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
275 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
276 case SHADER_OPCODE_URB_READ_SIMD8:
277 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
278 return true;
279 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
280 return src[1].file == VGRF;
281 case FS_OPCODE_FB_WRITE:
282 return src[0].file == VGRF;
283 default:
284 if (is_tex())
285 return src[0].file == VGRF;
286
287 return false;
288 }
289 }
290
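/**
 * Returns true if this LOAD_PAYLOAD just reassembles a single contiguous
 * VGRF, copied in order and in full.
 */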
291 bool
292 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
293 {
294 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
295 return false;
296
297 fs_reg reg = this->src[0];
298 if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0)
299 return false;
300
301 if (grf_alloc.sizes[reg.nr] != this->regs_written)
302 return false;
303
304 for (int i = 0; i < this->sources; i++) {
305 reg.type = this->src[i].type;
306 if (!this->src[i].equals(reg))
307 return false;
308
309 if (i < this->header_size) {
310 reg.reg_offset += 1;
311 } else {
312 reg.reg_offset += this->exec_size / 8;
313 }
314 }
315
316 return true;
317 }
318
319 bool
320 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
321 {
322 if (devinfo->gen == 6 && is_math())
323 return false;
324
325 if (is_send_from_grf())
326 return false;
327
328 if (!backend_instruction::can_do_source_mods())
329 return false;
330
331 return true;
332 }
333
334 bool
335 fs_inst::can_change_types() const
336 {
337 return dst.type == src[0].type &&
338 !src[0].abs && !src[0].negate && !saturate &&
339 (opcode == BRW_OPCODE_MOV ||
340 (opcode == BRW_OPCODE_SEL &&
341 dst.type == src[1].type &&
342 predicate != BRW_PREDICATE_NONE &&
343 !src[1].abs && !src[1].negate));
344 }
345
346 bool
347 fs_inst::has_side_effects() const
348 {
349 return this->eot || backend_instruction::has_side_effects();
350 }
351
352 void
353 fs_reg::init()
354 {
355 memset(this, 0, sizeof(*this));
356 stride = 1;
357 }
358
359 /** Generic unset register constructor. */
360 fs_reg::fs_reg()
361 {
362 init();
363 this->file = BAD_FILE;
364 }
365
366 fs_reg::fs_reg(struct ::brw_reg reg) :
367 backend_reg(reg)
368 {
369 this->reg_offset = 0;
370 this->subreg_offset = 0;
371 this->reladdr = NULL;
372 this->stride = 1;
373 if (this->file == IMM &&
374 (this->type != BRW_REGISTER_TYPE_V &&
375 this->type != BRW_REGISTER_TYPE_UV &&
376 this->type != BRW_REGISTER_TYPE_VF)) {
377 this->stride = 0;
378 }
379 }
380
381 bool
382 fs_reg::equals(const fs_reg &r) const
383 {
384 return (this->backend_reg::equals(r) &&
385 subreg_offset == r.subreg_offset &&
386 !reladdr && !r.reladdr &&
387 stride == r.stride);
388 }
389
390 fs_reg &
391 fs_reg::set_smear(unsigned subreg)
392 {
393 assert(file != ARF && file != FIXED_GRF && file != IMM);
394 subreg_offset = subreg * type_sz(type);
395 stride = 0;
396 return *this;
397 }
398
399 bool
400 fs_reg::is_contiguous() const
401 {
402 return stride == 1;
403 }
404
405 unsigned
406 fs_reg::component_size(unsigned width) const
407 {
408 const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
409 hstride == 0 ? 0 :
410 1 << (hstride - 1));
411 return MAX2(width * stride, 1) * type_sz(type);
412 }
413
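/**
 * Returns the number of scalar components needed to store a value of the
 * given GLSL type, with no vec4 padding (each component takes one slot).
 */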
414 extern "C" int
415 type_size_scalar(const struct glsl_type *type)
416 {
417 unsigned int size, i;
418
419 switch (type->base_type) {
420 case GLSL_TYPE_UINT:
421 case GLSL_TYPE_INT:
422 case GLSL_TYPE_FLOAT:
423 case GLSL_TYPE_BOOL:
424 return type->components();
425 case GLSL_TYPE_ARRAY:
426 return type_size_scalar(type->fields.array) * type->length;
427 case GLSL_TYPE_STRUCT:
428 size = 0;
429 for (i = 0; i < type->length; i++) {
430 size += type_size_scalar(type->fields.structure[i].type);
431 }
432 return size;
433 case GLSL_TYPE_SAMPLER:
434 /* Samplers take up no register space, since they're baked in at
435 * link time.
436 */
437 return 0;
438 case GLSL_TYPE_ATOMIC_UINT:
439 return 0;
440 case GLSL_TYPE_SUBROUTINE:
441 return 1;
442 case GLSL_TYPE_IMAGE:
443 return BRW_IMAGE_PARAM_SIZE;
444 case GLSL_TYPE_VOID:
445 case GLSL_TYPE_ERROR:
446 case GLSL_TYPE_INTERFACE:
447 case GLSL_TYPE_DOUBLE:
448 unreachable("not reached");
449 }
450
451 return 0;
452 }
453
454 /**
455 * Returns the number of scalar components needed to store type, assuming
456 * that vectors are padded out to vec4.
457 *
458 * This has the packing rules of type_size_vec4(), but counts components
459 * similar to type_size_scalar().
460 */
461 extern "C" int
462 type_size_vec4_times_4(const struct glsl_type *type)
463 {
464 return 4 * type_size_vec4(type);
465 }
466
467 /**
468 * Create a MOV to read the timestamp register.
469 *
470 * The caller is responsible for emitting the MOV. The return value is
471 * the destination of the MOV, with extra parameters set.
472 */
473 fs_reg
474 fs_visitor::get_timestamp(const fs_builder &bld)
475 {
476 assert(devinfo->gen >= 7);
477
478 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
479 BRW_ARF_TIMESTAMP,
480 0),
481 BRW_REGISTER_TYPE_UD));
482
483 fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
484
485 /* We want to read the 3 fields we care about even if it's not enabled in
486 * the dispatch.
487 */
488 bld.group(4, 0).exec_all().MOV(dst, ts);
489
490 return dst;
491 }
492
493 void
494 fs_visitor::emit_shader_time_begin()
495 {
496 shader_start_time = get_timestamp(bld.annotate("shader time start"));
497
498 /* We want only the low 32 bits of the timestamp. Since it's running
499 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
500 * which is plenty of time for our purposes. It is identical across the
501 * EUs, but since it's tracking GPU core speed it will increment at a
502 * varying rate as render P-states change.
503 */
504 shader_start_time.set_smear(0);
505 }
506
507 void
508 fs_visitor::emit_shader_time_end()
509 {
510 /* Insert our code just before the final SEND with EOT. */
511 exec_node *end = this->instructions.get_tail();
512 assert(end && ((fs_inst *) end)->eot);
513 const fs_builder ibld = bld.annotate("shader time end")
514 .exec_all().at(NULL, end);
515
516 fs_reg shader_end_time = get_timestamp(ibld);
517
518 /* We only use the low 32 bits of the timestamp - see
519 * emit_shader_time_begin().
520 *
521 * We could also check if render P-states have changed (or anything
522 * else that might disrupt timing) by setting smear to 2 and checking if
523 * that field is != 0.
524 */
525 shader_end_time.set_smear(0);
526
527 /* Check that there weren't any timestamp reset events (assuming these
528 * were the only two timestamp reads that happened).
529 */
530 fs_reg reset = shader_end_time;
531 reset.set_smear(2);
532 set_condmod(BRW_CONDITIONAL_Z,
533 ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
534 ibld.IF(BRW_PREDICATE_NORMAL);
535
536 fs_reg start = shader_start_time;
537 start.negate = true;
538 fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
539 diff.set_smear(0);
540
541 const fs_builder cbld = ibld.group(1, 0);
542 cbld.group(1, 0).ADD(diff, start, shader_end_time);
543
544 /* If there were no instructions between the two timestamp gets, the diff
545 * is 2 cycles. Remove that overhead, so we can ignore it when trying to
546 * determine the time taken by individual instructions.
547 */
548 cbld.ADD(diff, diff, brw_imm_ud(-2u));
549 SHADER_TIME_ADD(cbld, 0, diff);
550 SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
551 ibld.emit(BRW_OPCODE_ELSE);
552 SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
553 ibld.emit(BRW_OPCODE_ENDIF);
554 }
555
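/**
 * Emit the SHADER_TIME_ADD pseudo-instruction that accumulates "value" into
 * the shader-time buffer slot selected by shader_time_index and the given
 * subindex.
 */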
556 void
557 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
558 int shader_time_subindex,
559 fs_reg value)
560 {
561 int index = shader_time_index * 3 + shader_time_subindex;
562 struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
563
564 fs_reg payload;
565 if (dispatch_width == 8)
566 payload = vgrf(glsl_type::uvec2_type);
567 else
568 payload = vgrf(glsl_type::uint_type);
569
570 bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
571 }
572
573 void
574 fs_visitor::vfail(const char *format, va_list va)
575 {
576 char *msg;
577
578 if (failed)
579 return;
580
581 failed = true;
582
583 msg = ralloc_vasprintf(mem_ctx, format, va);
584 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
585
586 this->fail_msg = msg;
587
588 if (debug_enabled) {
589 fprintf(stderr, "%s", msg);
590 }
591 }
592
593 void
594 fs_visitor::fail(const char *format, ...)
595 {
596 va_list va;
597
598 va_start(va, format);
599 vfail(format, va);
600 va_end(va);
601 }
602
603 /**
604 * Mark this program as impossible to compile in SIMD16 mode.
605 *
606 * During the SIMD8 compile (which happens first), we can detect and flag
607 * things that are unsupported in SIMD16 mode, so the compiler can skip
608 * the SIMD16 compile altogether.
609 *
610 * During a SIMD16 compile (if one happens anyway), this just calls fail().
611 */
612 void
613 fs_visitor::no16(const char *msg)
614 {
615 if (dispatch_width == 16) {
616 fail("%s", msg);
617 } else {
618 simd16_unsupported = true;
619
620 compiler->shader_perf_log(log_data,
621 "SIMD16 shader failed to compile: %s", msg);
622 }
623 }
624
625 /**
626 * Returns true if the instruction has a flag that means it won't
627 * update an entire destination register.
628 *
629 * For example, dead code elimination and live variable analysis want to know
630 * when a write to a variable screens off any preceding values that were in
631 * it.
632 */
633 bool
634 fs_inst::is_partial_write() const
635 {
636 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
637 (this->exec_size * type_sz(this->dst.type)) < 32 ||
638 !this->dst.is_contiguous());
639 }
640
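/**
 * Returns the number of logical components this instruction reads from
 * source i: 1 for most opcodes, while logical sends encode their component
 * counts in immediate sources.
 */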
641 unsigned
642 fs_inst::components_read(unsigned i) const
643 {
644 switch (opcode) {
645 case FS_OPCODE_LINTERP:
646 if (i == 0)
647 return 2;
648 else
649 return 1;
650
651 case FS_OPCODE_PIXEL_X:
652 case FS_OPCODE_PIXEL_Y:
653 assert(i == 0);
654 return 2;
655
656 case FS_OPCODE_FB_WRITE_LOGICAL:
657 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
658 /* First/second FB write color. */
659 if (i < 2)
660 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
661 else
662 return 1;
663
664 case SHADER_OPCODE_TEX_LOGICAL:
665 case SHADER_OPCODE_TXD_LOGICAL:
666 case SHADER_OPCODE_TXF_LOGICAL:
667 case SHADER_OPCODE_TXL_LOGICAL:
668 case SHADER_OPCODE_TXS_LOGICAL:
669 case FS_OPCODE_TXB_LOGICAL:
670 case SHADER_OPCODE_TXF_CMS_LOGICAL:
671 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
672 case SHADER_OPCODE_TXF_UMS_LOGICAL:
673 case SHADER_OPCODE_TXF_MCS_LOGICAL:
674 case SHADER_OPCODE_LOD_LOGICAL:
675 case SHADER_OPCODE_TG4_LOGICAL:
676 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
677 assert(src[8].file == IMM && src[9].file == IMM);
678 /* Texture coordinates. */
679 if (i == 0)
680 return src[8].ud;
681 /* Texture derivatives. */
682 else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
683 return src[9].ud;
684 /* Texture offset. */
685 else if (i == 7)
686 return 2;
687 /* MCS */
688 else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
689 return 2;
690 else
691 return 1;
692
693 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
694 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
695 assert(src[3].file == IMM);
696 /* Surface coordinates. */
697 if (i == 0)
698 return src[3].ud;
699 /* Surface operation source (ignored for reads). */
700 else if (i == 1)
701 return 0;
702 else
703 return 1;
704
705 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
706 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
707 assert(src[3].file == IMM &&
708 src[4].file == IMM);
709 /* Surface coordinates. */
710 if (i == 0)
711 return src[3].ud;
712 /* Surface operation source. */
713 else if (i == 1)
714 return src[4].ud;
715 else
716 return 1;
717
718 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
719 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
720 assert(src[3].file == IMM &&
721 src[4].file == IMM);
722 const unsigned op = src[4].ud;
723 /* Surface coordinates. */
724 if (i == 0)
725 return src[3].ud;
726 /* Surface operation source. */
727 else if (i == 1 && op == BRW_AOP_CMPWR)
728 return 2;
729 else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
730 op == BRW_AOP_PREDEC))
731 return 0;
732 else
733 return 1;
734 }
735
736 default:
737 return 1;
738 }
739 }
740
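/**
 * Returns the number of hardware registers read by source arg, accounting
 * for message lengths and per-source component counts.
 */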
741 int
742 fs_inst::regs_read(int arg) const
743 {
744 switch (opcode) {
745 case FS_OPCODE_FB_WRITE:
746 case SHADER_OPCODE_URB_WRITE_SIMD8:
747 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
748 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
749 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
750 case SHADER_OPCODE_URB_READ_SIMD8:
751 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
752 case SHADER_OPCODE_UNTYPED_ATOMIC:
753 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
754 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
755 case SHADER_OPCODE_TYPED_ATOMIC:
756 case SHADER_OPCODE_TYPED_SURFACE_READ:
757 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
758 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
759 if (arg == 0)
760 return mlen;
761 break;
762
763 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
764 /* The payload is actually stored in src[1]. */
765 if (arg == 1)
766 return mlen;
767 break;
768
769 case FS_OPCODE_LINTERP:
770 if (arg == 1)
771 return 1;
772 break;
773
774 case SHADER_OPCODE_LOAD_PAYLOAD:
775 if (arg < this->header_size)
776 return 1;
777 break;
778
779 case CS_OPCODE_CS_TERMINATE:
780 case SHADER_OPCODE_BARRIER:
781 return 1;
782
783 case SHADER_OPCODE_MOV_INDIRECT:
784 if (arg == 0) {
785 assert(src[2].file == IMM);
786 unsigned region_length = src[2].ud;
787
788 if (src[0].file == FIXED_GRF) {
789 /* If the start of the region is not register aligned, then
790 * there's some portion of the register that's technically
791 * unread at the beginning.
792 *
793 * However, the register allocator works in terms of whole
794 * registers, and does not use subnr. It assumes that the
795 * read starts at the beginning of the register, and extends
796 * regs_read() whole registers beyond that.
797 *
798 * To compensate, we extend the region length to include this
799 * unread portion at the beginning.
800 */
801 if (src[0].subnr)
802 region_length += src[0].subnr * type_sz(src[0].type);
803
804 return DIV_ROUND_UP(region_length, REG_SIZE);
805 } else {
806 assert(!"Invalid register file");
807 }
808 }
809 break;
810
811 default:
812 if (is_tex() && arg == 0 && src[0].file == VGRF)
813 return mlen;
814 break;
815 }
816
817 switch (src[arg].file) {
818 case BAD_FILE:
819 return 0;
820 case UNIFORM:
821 case IMM:
822 return 1;
823 case ARF:
824 case FIXED_GRF:
825 case VGRF:
826 case ATTR:
827 return DIV_ROUND_UP(components_read(arg) *
828 src[arg].component_size(exec_size),
829 REG_SIZE);
830 case MRF:
831 unreachable("MRF registers are not allowed as sources");
832 }
833 return 0;
834 }
835
836 bool
837 fs_inst::reads_flag() const
838 {
839 return predicate;
840 }
841
842 bool
843 fs_inst::writes_flag() const
844 {
845 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
846 opcode != BRW_OPCODE_IF &&
847 opcode != BRW_OPCODE_WHILE)) ||
848 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
849 }
850
851 /**
852 * Returns how many MRFs an FS opcode will write over.
853 *
854 * Note that this is not the 0 or 1 implied writes in an actual gen
855 * instruction -- the FS opcodes often generate MOVs in addition.
856 */
857 int
858 fs_visitor::implied_mrf_writes(fs_inst *inst)
859 {
860 if (inst->mlen == 0)
861 return 0;
862
863 if (inst->base_mrf == -1)
864 return 0;
865
866 switch (inst->opcode) {
867 case SHADER_OPCODE_RCP:
868 case SHADER_OPCODE_RSQ:
869 case SHADER_OPCODE_SQRT:
870 case SHADER_OPCODE_EXP2:
871 case SHADER_OPCODE_LOG2:
872 case SHADER_OPCODE_SIN:
873 case SHADER_OPCODE_COS:
874 return 1 * dispatch_width / 8;
875 case SHADER_OPCODE_POW:
876 case SHADER_OPCODE_INT_QUOTIENT:
877 case SHADER_OPCODE_INT_REMAINDER:
878 return 2 * dispatch_width / 8;
879 case SHADER_OPCODE_TEX:
880 case FS_OPCODE_TXB:
881 case SHADER_OPCODE_TXD:
882 case SHADER_OPCODE_TXF:
883 case SHADER_OPCODE_TXF_CMS:
884 case SHADER_OPCODE_TXF_CMS_W:
885 case SHADER_OPCODE_TXF_MCS:
886 case SHADER_OPCODE_TG4:
887 case SHADER_OPCODE_TG4_OFFSET:
888 case SHADER_OPCODE_TXL:
889 case SHADER_OPCODE_TXS:
890 case SHADER_OPCODE_LOD:
891 case SHADER_OPCODE_SAMPLEINFO:
892 return 1;
893 case FS_OPCODE_FB_WRITE:
894 return 2;
895 case FS_OPCODE_GET_BUFFER_SIZE:
896 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
897 case SHADER_OPCODE_GEN4_SCRATCH_READ:
898 return 1;
899 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
900 return inst->mlen;
901 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
902 return inst->mlen;
903 case SHADER_OPCODE_UNTYPED_ATOMIC:
904 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
905 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
906 case SHADER_OPCODE_TYPED_ATOMIC:
907 case SHADER_OPCODE_TYPED_SURFACE_READ:
908 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
909 case SHADER_OPCODE_URB_WRITE_SIMD8:
910 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
911 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
912 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
913 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
914 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
915 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
916 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
917 return 0;
918 default:
919 unreachable("not reached");
920 }
921 }
922
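/**
 * Allocate a virtual GRF large enough to hold one value of the given GLSL
 * type per channel at the current dispatch width.
 */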
923 fs_reg
924 fs_visitor::vgrf(const glsl_type *const type)
925 {
926 int reg_width = dispatch_width / 8;
927 return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
928 brw_type_for_base_type(type));
929 }
930
931 fs_reg::fs_reg(enum brw_reg_file file, int nr)
932 {
933 init();
934 this->file = file;
935 this->nr = nr;
936 this->type = BRW_REGISTER_TYPE_F;
937 this->stride = (file == UNIFORM ? 0 : 1);
938 }
939
940 fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
941 {
942 init();
943 this->file = file;
944 this->nr = nr;
945 this->type = type;
946 this->stride = (file == UNIFORM ? 0 : 1);
947 }
948
949 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
950 * dispatch. This brings in those uniform definitions.
951 */
952 void
953 fs_visitor::import_uniforms(fs_visitor *v)
954 {
955 this->push_constant_loc = v->push_constant_loc;
956 this->pull_constant_loc = v->pull_constant_loc;
957 this->uniforms = v->uniforms;
958 this->param_size = v->param_size;
959 }
960
961 fs_reg *
962 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
963 bool origin_upper_left)
964 {
965 assert(stage == MESA_SHADER_FRAGMENT);
966 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
967 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
968 fs_reg wpos = *reg;
969 bool flip = !origin_upper_left ^ key->render_to_fbo;
970
971 /* gl_FragCoord.x */
972 if (pixel_center_integer) {
973 bld.MOV(wpos, this->pixel_x);
974 } else {
975 bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
976 }
977 wpos = offset(wpos, bld, 1);
978
979 /* gl_FragCoord.y */
980 if (!flip && pixel_center_integer) {
981 bld.MOV(wpos, this->pixel_y);
982 } else {
983 fs_reg pixel_y = this->pixel_y;
984 float offset = (pixel_center_integer ? 0.0f : 0.5f);
985
986 if (flip) {
987 pixel_y.negate = true;
988 offset += key->drawable_height - 1.0f;
989 }
990
991 bld.ADD(wpos, pixel_y, brw_imm_f(offset));
992 }
993 wpos = offset(wpos, bld, 1);
994
995 /* gl_FragCoord.z */
996 if (devinfo->gen >= 6) {
997 bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
998 } else {
999 bld.emit(FS_OPCODE_LINTERP, wpos,
1000 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1001 interp_reg(VARYING_SLOT_POS, 2));
1002 }
1003 wpos = offset(wpos, bld, 1);
1004
1005 /* gl_FragCoord.w: Already set up in emit_interpolation */
1006 bld.MOV(wpos, this->wpos_w);
1007
1008 return reg;
1009 }
1010
1011 fs_inst *
1012 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1013 glsl_interp_qualifier interpolation_mode,
1014 bool is_centroid, bool is_sample)
1015 {
1016 brw_wm_barycentric_interp_mode barycoord_mode;
1017 if (devinfo->gen >= 6) {
1018 if (is_centroid) {
1019 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1020 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1021 else
1022 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1023 } else if (is_sample) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1028 } else {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1033 }
1034 } else {
1035 /* On Ironlake and below, there is only one interpolation mode.
1036 * Centroid interpolation doesn't mean anything on this hardware --
1037 * there is no multisampling.
1038 */
1039 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1040 }
1041 return bld.emit(FS_OPCODE_LINTERP, attr,
1042 this->delta_xy[barycoord_mode], interp);
1043 }
1044
1045 void
1046 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1047 const glsl_type *type,
1048 glsl_interp_qualifier interpolation_mode,
1049 int location, bool mod_centroid,
1050 bool mod_sample)
1051 {
1052 attr.type = brw_type_for_base_type(type->get_scalar_type());
1053
1054 assert(stage == MESA_SHADER_FRAGMENT);
1055 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1056 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1057
1058 unsigned int array_elements;
1059
1060 if (type->is_array()) {
1061 array_elements = type->arrays_of_arrays_size();
1062 if (array_elements == 0) {
1063 fail("dereferenced array '%s' has length 0\n", name);
1064 }
1065 type = type->without_array();
1066 } else {
1067 array_elements = 1;
1068 }
1069
1070 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1071 bool is_gl_Color =
1072 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1073 if (key->flat_shade && is_gl_Color) {
1074 interpolation_mode = INTERP_QUALIFIER_FLAT;
1075 } else {
1076 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1077 }
1078 }
1079
1080 for (unsigned int i = 0; i < array_elements; i++) {
1081 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1082 if (prog_data->urb_setup[location] == -1) {
1083 /* If there's no incoming setup data for this slot, don't
1084 * emit interpolation for it.
1085 */
1086 attr = offset(attr, bld, type->vector_elements);
1087 location++;
1088 continue;
1089 }
1090
1091 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1092 /* Constant interpolation (flat shading) case. The SF has
1093 * handed us defined values in only the constant offset
1094 * field of the setup reg.
1095 */
1096 for (unsigned int k = 0; k < type->vector_elements; k++) {
1097 struct brw_reg interp = interp_reg(location, k);
1098 interp = suboffset(interp, 3);
1099 interp.type = attr.type;
1100 bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1101 attr = offset(attr, bld, 1);
1102 }
1103 } else {
1104 /* Smooth/noperspective interpolation case. */
1105 for (unsigned int k = 0; k < type->vector_elements; k++) {
1106 struct brw_reg interp = interp_reg(location, k);
1107 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1108 /* Get the pixel/sample mask into f0 so that we know
1109 * which pixels are lit. Then, for each channel that is
1110 * unlit, replace the centroid data with non-centroid
1111 * data.
1112 */
1113 bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1114
1115 fs_inst *inst;
1116 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1117 false, false);
1118 inst->predicate = BRW_PREDICATE_NORMAL;
1119 inst->predicate_inverse = true;
1120 if (devinfo->has_pln)
1121 inst->no_dd_clear = true;
1122
1123 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1124 mod_centroid && !key->persample_shading,
1125 mod_sample || key->persample_shading);
1126 inst->predicate = BRW_PREDICATE_NORMAL;
1127 inst->predicate_inverse = false;
1128 if (devinfo->has_pln)
1129 inst->no_dd_check = true;
1130
1131 } else {
1132 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1133 mod_centroid && !key->persample_shading,
1134 mod_sample || key->persample_shading);
1135 }
1136 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1137 bld.MUL(attr, attr, this->pixel_w);
1138 }
1139 attr = offset(attr, bld, 1);
1140 }
1141
1142 }
1143 location++;
1144 }
1145 }
1146 }
1147
1148 fs_reg *
1149 fs_visitor::emit_frontfacing_interpolation()
1150 {
1151 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1152
1153 if (devinfo->gen >= 6) {
1154 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1155 * a boolean result from this (~0/true or 0/false).
1156 *
1157 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1158 * this task in only one instruction:
1159 * - a negation source modifier will flip the bit; and
1160 * - a W -> D type conversion will sign extend the bit into the high
1161 * word of the destination.
1162 *
1163 * An ASR 15 fills the low word of the destination.
1164 */
1165 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1166 g0.negate = true;
1167
1168 bld.ASR(*reg, g0, brw_imm_d(15));
1169 } else {
1170 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1171 * a boolean result from this (1/true or 0/false).
1172 *
1173 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1174 * the negation source modifier to flip it. Unfortunately the SHR
1175 * instruction only operates on UD (or D with an abs source modifier)
1176 * sources without negation.
1177 *
1178 * Instead, use ASR (which will give ~0/true or 0/false).
1179 */
1180 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1181 g1_6.negate = true;
1182
1183 bld.ASR(*reg, g1_6, brw_imm_d(31));
1184 }
1185
1186 return reg;
1187 }
1188
1189 void
1190 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1191 {
1192 assert(stage == MESA_SHADER_FRAGMENT);
1193 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1194 assert(dst.type == BRW_REGISTER_TYPE_F);
1195
1196 if (key->compute_pos_offset) {
1197 /* Convert int_sample_pos to floating point */
1198 bld.MOV(dst, int_sample_pos);
1199 /* Scale to the range [0, 1] */
1200 bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1201 }
1202 else {
1203 /* From the ARB_sample_shading specification:
1204 * "When rendering to a non-multisample buffer, or if multisample
1205 * rasterization is disabled, gl_SamplePosition will always be
1206 * (0.5, 0.5)."
1207 */
1208 bld.MOV(dst, brw_imm_f(0.5f));
1209 }
1210 }
1211
1212 fs_reg *
1213 fs_visitor::emit_samplepos_setup()
1214 {
1215 assert(devinfo->gen >= 6);
1216
1217 const fs_builder abld = bld.annotate("compute sample position");
1218 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1219 fs_reg pos = *reg;
1220 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1221 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1222
1223 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1224 * mode will be enabled.
1225 *
1226 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1227 * R31.1:0 Position Offset X/Y for Slot[3:0]
1228 * R31.3:2 Position Offset X/Y for Slot[7:4]
1229 * .....
1230 *
1231 * The X, Y sample positions come in as bytes in the thread payload. So, read
1232 * the positions using vstride=16, width=8, hstride=2.
1233 */
1234 struct brw_reg sample_pos_reg =
1235 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1236 BRW_REGISTER_TYPE_B), 16, 8, 2);
1237
1238 if (dispatch_width == 8) {
1239 abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1240 } else {
1241 abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1242 abld.half(1).MOV(half(int_sample_x, 1),
1243 fs_reg(suboffset(sample_pos_reg, 16)));
1244 }
1245 /* Compute gl_SamplePosition.x */
1246 compute_sample_position(pos, int_sample_x);
1247 pos = offset(pos, abld, 1);
1248 if (dispatch_width == 8) {
1249 abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1250 } else {
1251 abld.half(0).MOV(half(int_sample_y, 0),
1252 fs_reg(suboffset(sample_pos_reg, 1)));
1253 abld.half(1).MOV(half(int_sample_y, 1),
1254 fs_reg(suboffset(sample_pos_reg, 17)));
1255 }
1256 /* Compute gl_SamplePosition.y */
1257 compute_sample_position(pos, int_sample_y);
1258 return reg;
1259 }
1260
1261 fs_reg *
1262 fs_visitor::emit_sampleid_setup()
1263 {
1264 assert(stage == MESA_SHADER_FRAGMENT);
1265 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1266 assert(devinfo->gen >= 6);
1267
1268 const fs_builder abld = bld.annotate("compute sample id");
1269 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1270
1271 if (key->compute_sample_id) {
1272 fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
1273 t1.set_smear(0);
1274 fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
1275
1276 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1277 * 8x multisampling, subspan 0 will represent sample N (where N
1278 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1279 * 7. We can find the value of N by looking at R0.0 bits 7:6
1280 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1281 * (since samples are always delivered in pairs). That is, we
1282 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1283 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1284 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1285 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1286 * populating a temporary variable with the sequence (0, 1, 2, 3),
1287 * and then reading from it using vstride=1, width=4, hstride=0.
1288 * These computations hold good for 4x multisampling as well.
1289 *
1290 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1291 * the first four slots are sample 0 of subspan 0; the next four
1292 * are sample 1 of subspan 0; the third group is sample 0 of
1293 * subspan 1, and finally sample 1 of subspan 1.
1294 */
1295
1296 /* SKL+ has an extra bit for the Starting Sample Pair Index to
1297 * accommodate 16x MSAA.
1298 */
1299 unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
1300
1301 abld.exec_all().group(1, 0)
1302 .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1303 brw_imm_ud(sspi_mask));
1304 abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1305
1306 /* This works for both SIMD8 and SIMD16 */
1307 abld.exec_all().group(4, 0)
1308 .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1309
1310 /* This special instruction takes care of setting vstride=1,
1311 * width=4, hstride=0 of t2 during an ADD instruction.
1312 */
1313 abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1314 } else {
1315 /* As per GL_ARB_sample_shading specification:
1316 * "When rendering to a non-multisample buffer, or if multisample
1317 * rasterization is disabled, gl_SampleID will always be zero."
1318 */
1319 abld.MOV(*reg, brw_imm_d(0));
1320 }
1321
1322 return reg;
1323 }
1324
1325 fs_reg
1326 fs_visitor::resolve_source_modifiers(const fs_reg &src)
1327 {
1328 if (!src.abs && !src.negate)
1329 return src;
1330
1331 fs_reg temp = bld.vgrf(src.type);
1332 bld.MOV(temp, src);
1333
1334 return temp;
1335 }
1336
1337 void
1338 fs_visitor::emit_discard_jump()
1339 {
1340 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1341
1342 /* For performance, after a discard, jump to the end of the
1343 * shader if all relevant channels have been discarded.
1344 */
1345 fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1346 discard_jump->flag_subreg = 1;
1347
1348 discard_jump->predicate = (dispatch_width == 8)
1349 ? BRW_PREDICATE_ALIGN1_ANY8H
1350 : BRW_PREDICATE_ALIGN1_ANY16H;
1351 discard_jump->predicate_inverse = true;
1352 }
1353
1354 void
1355 fs_visitor::emit_gs_thread_end()
1356 {
1357 assert(stage == MESA_SHADER_GEOMETRY);
1358
1359 struct brw_gs_prog_data *gs_prog_data =
1360 (struct brw_gs_prog_data *) prog_data;
1361
1362 if (gs_compile->control_data_header_size_bits > 0) {
1363 emit_gs_control_data_bits(this->final_gs_vertex_count);
1364 }
1365
1366 const fs_builder abld = bld.annotate("thread end");
1367 fs_inst *inst;
1368
1369 if (gs_prog_data->static_vertex_count != -1) {
1370 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1371 if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1372 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1373 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1374 prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1375 prev->eot = true;
1376
1377 /* Delete now dead instructions. */
1378 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1379 if (dead == prev)
1380 break;
1381 dead->remove();
1382 }
1383 return;
1384 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1385 break;
1386 }
1387 }
1388 fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1389 abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1390 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1391 inst->mlen = 1;
1392 } else {
1393 fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1394 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1395 sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1396 sources[1] = this->final_gs_vertex_count;
1397 abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1398 inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1399 inst->mlen = 2;
1400 }
1401 inst->eot = true;
1402 inst->offset = 0;
1403 }
1404
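/**
 * Assign push constant (CURB) locations: record where the constants start in
 * the thread payload and rewrite UNIFORM-file sources to fixed hardware
 * registers.
 */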
1405 void
1406 fs_visitor::assign_curb_setup()
1407 {
1408 if (dispatch_width == 8) {
1409 prog_data->dispatch_grf_start_reg = payload.num_regs;
1410 } else {
1411 if (stage == MESA_SHADER_FRAGMENT) {
1412 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1413 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1414 } else if (stage == MESA_SHADER_COMPUTE) {
1415 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1416 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1417 } else {
1418 unreachable("Unsupported shader type!");
1419 }
1420 }
1421
1422 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1423
1424 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1425 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1426 for (unsigned int i = 0; i < inst->sources; i++) {
1427 if (inst->src[i].file == UNIFORM) {
1428 int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
1429 int constant_nr;
1430 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1431 constant_nr = push_constant_loc[uniform_nr];
1432 } else {
1433 /* Section 5.11 of the OpenGL 4.1 spec says:
1434 * "Out-of-bounds reads return undefined values, which include
1435 * values from other variables of the active program or zero."
1436 * Just return the first push constant.
1437 */
1438 constant_nr = 0;
1439 }
1440
1441 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1442 constant_nr / 8,
1443 constant_nr % 8);
1444 brw_reg.abs = inst->src[i].abs;
1445 brw_reg.negate = inst->src[i].negate;
1446
1447 assert(inst->src[i].stride == 0);
1448 inst->src[i] = byte_offset(
1449 retype(brw_reg, inst->src[i].type),
1450 inst->src[i].subreg_offset);
1451 }
1452 }
1453 }
1454
1455 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1456 this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1457 }
1458
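/**
 * Decide which URB slot each fragment shader input varying will be read
 * from, filling in prog_data->urb_setup[].
 */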
1459 void
1460 fs_visitor::calculate_urb_setup()
1461 {
1462 assert(stage == MESA_SHADER_FRAGMENT);
1463 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1464 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1465
1466 memset(prog_data->urb_setup, -1,
1467 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1468
1469 int urb_next = 0;
1470 /* Figure out where each of the incoming setup attributes lands. */
1471 if (devinfo->gen >= 6) {
1472 if (_mesa_bitcount_64(nir->info.inputs_read &
1473 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1474 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1475 * first 16 varying inputs, so we can put them wherever we want.
1476 * Just put them in order.
1477 *
1478 * This is useful because it means that (a) inputs not used by the
1479 * fragment shader won't take up valuable register space, and (b) we
1480 * won't have to recompile the fragment shader if it gets paired with
1481 * a different vertex (or geometry) shader.
1482 */
1483 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1484 if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1485 BITFIELD64_BIT(i)) {
1486 prog_data->urb_setup[i] = urb_next++;
1487 }
1488 }
1489 } else {
1490 bool include_vue_header =
1491 nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
1492
1493 /* We have enough input varyings that the SF/SBE pipeline stage can't
1494 * arbitrarily rearrange them to suit our whim; we have to put them
1495 * in an order that matches the output of the previous pipeline stage
1496 * (geometry or vertex shader).
1497 */
1498 struct brw_vue_map prev_stage_vue_map;
1499 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1500 key->input_slots_valid,
1501 nir->info.separate_shader);
1502 int first_slot =
1503 include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1504
1505 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1506 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1507 slot++) {
1508 int varying = prev_stage_vue_map.slot_to_varying[slot];
1509 if (varying != BRW_VARYING_SLOT_PAD &&
1510 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1511 BITFIELD64_BIT(varying))) {
1512 prog_data->urb_setup[varying] = slot - first_slot;
1513 }
1514 }
1515 urb_next = prev_stage_vue_map.num_slots - first_slot;
1516 }
1517 } else {
1518 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1519 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1520 /* Point size is packed into the header, not as a general attribute */
1521 if (i == VARYING_SLOT_PSIZ)
1522 continue;
1523
1524 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1525 /* The back color slot is skipped when the front color is
1526 * also written to. In addition, some slots can be
1527 * written in the vertex shader and not read in the
1528 * fragment shader. So the register number must always be
1529 * incremented, mapped or not.
1530 */
1531 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1532 prog_data->urb_setup[i] = urb_next;
1533 urb_next++;
1534 }
1535 }
1536
1537 /*
1538 * gl_PointCoord is an FS-only attribute, and interpolation for it was
1539 * already done in the SF thread. So, count it here, too.
1540 *
1541 * See compile_sf_prog() for more info.
1542 */
1543 if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1544 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1545 }
1546
1547 prog_data->num_varying_inputs = urb_next;
1548 }
1549
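/**
 * Now that the payload and push constant sizes are known, offset the LINTERP
 * and CINTERP setup-register sources to their final hardware GRF numbers.
 */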
1550 void
1551 fs_visitor::assign_urb_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1555
1556 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1557
1558 /* Offset all the urb_setup[] index by the actual position of the
1559 * setup regs, now that the location of the constants has been chosen.
1560 */
1561 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1562 if (inst->opcode == FS_OPCODE_LINTERP) {
1563 assert(inst->src[1].file == FIXED_GRF);
1564 inst->src[1].nr += urb_start;
1565 }
1566
1567 if (inst->opcode == FS_OPCODE_CINTERP) {
1568 assert(inst->src[0].file == FIXED_GRF);
1569 inst->src[0].nr += urb_start;
1570 }
1571 }
1572
1573 /* Each attribute is 4 setup channels, each of which is half a reg. */
1574 this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1575 }
1576
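/**
 * Rewrite ATTR-file sources of an instruction to the fixed hardware GRFs
 * where the attributes were delivered in the thread payload.
 */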
1577 void
1578 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1579 {
1580 for (int i = 0; i < inst->sources; i++) {
1581 if (inst->src[i].file == ATTR) {
1582 int grf = payload.num_regs +
1583 prog_data->curb_read_length +
1584 inst->src[i].nr +
1585 inst->src[i].reg_offset;
1586
1587 unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
1588 struct brw_reg reg =
1589 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1590 inst->src[i].subreg_offset),
1591 inst->exec_size * inst->src[i].stride,
1592 width, inst->src[i].stride);
1593 reg.abs = inst->src[i].abs;
1594 reg.negate = inst->src[i].negate;
1595
1596 inst->src[i] = reg;
1597 }
1598 }
1599 }
1600
1601 void
1602 fs_visitor::assign_vs_urb_setup()
1603 {
1604 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1605
1606 assert(stage == MESA_SHADER_VERTEX);
1607 int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1608 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1609 count++;
1610
1611 /* Each attribute is 4 regs. */
1612 this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
1613
1614 assert(vs_prog_data->base.urb_read_length <= 15);
1615
1616 /* Rewrite all ATTR file references to the hw grf that they land in. */
1617 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1618 convert_attr_sources_to_hw_regs(inst);
1619 }
1620 }
1621
1622 void
1623 fs_visitor::assign_gs_urb_setup()
1624 {
1625 assert(stage == MESA_SHADER_GEOMETRY);
1626
1627 brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
1628
1629 first_non_payload_grf +=
1630 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1631
1632 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1633 /* Rewrite all ATTR file references to GRFs. */
1634 convert_attr_sources_to_hw_regs(inst);
1635 }
1636 }
1637
1638
1639 /**
1640 * Split large virtual GRFs into separate components if we can.
1641 *
1642 * This is mostly duplicated with what brw_fs_vector_splitting does,
1643 * but that's really conservative because it's afraid of doing
1644 * splitting that doesn't result in real progress after the rest of
1645 * the optimization phases, which would cause infinite looping in
1646 * optimization. We can do it once here, safely. This also has the
1647 * opportunity to split interpolated values, or maybe even uniforms,
1648 * which we don't have at the IR level.
1649 *
1650 * We want to split, because virtual GRFs are what we register
1651 * allocate and spill (due to contiguousness requirements for some
1652 * instructions), and they're what we naturally generate in the
1653 * codegen process, but most virtual GRFs don't actually need to be
1654 * contiguous sets of GRFs. If we split, we'll end up with reduced
1655 * live intervals and better dead code elimination and coalescing.
1656 */
1657 void
1658 fs_visitor::split_virtual_grfs()
1659 {
1660 int num_vars = this->alloc.count;
1661
1662 /* Count the total number of registers */
1663 int reg_count = 0;
1664 int vgrf_to_reg[num_vars];
1665 for (int i = 0; i < num_vars; i++) {
1666 vgrf_to_reg[i] = reg_count;
1667 reg_count += alloc.sizes[i];
1668 }
1669
1670 /* An array of "split points". For each register slot, this indicates
1671 * if this slot can be separated from the previous slot. Every time an
1672 * instruction uses multiple elements of a register (as a source or
1673 * destination), we mark the used slots as inseparable. Then we go
1674 * through and split the registers into the smallest pieces we can.
1675 */
1676 bool split_points[reg_count];
1677 memset(split_points, 0, sizeof(split_points));
1678
1679 /* Mark all used registers as fully splittable */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 if (inst->dst.file == VGRF) {
1682 int reg = vgrf_to_reg[inst->dst.nr];
1683 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1684 split_points[reg + j] = true;
1685 }
1686
1687 for (int i = 0; i < inst->sources; i++) {
1688 if (inst->src[i].file == VGRF) {
1689 int reg = vgrf_to_reg[inst->src[i].nr];
1690 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1691 split_points[reg + j] = true;
1692 }
1693 }
1694 }
1695
1696 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1697 if (inst->dst.file == VGRF) {
1698 int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1699 for (int j = 1; j < inst->regs_written; j++)
1700 split_points[reg + j] = false;
1701 }
1702 for (int i = 0; i < inst->sources; i++) {
1703 if (inst->src[i].file == VGRF) {
1704 int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1705 for (int j = 1; j < inst->regs_read(i); j++)
1706 split_points[reg + j] = false;
1707 }
1708 }
1709 }
1710
1711 int new_virtual_grf[reg_count];
1712 int new_reg_offset[reg_count];
1713
1714 int reg = 0;
1715 for (int i = 0; i < num_vars; i++) {
1716 /* The first slot is never a split point; check that as a quick sanity test. */
1717 assert(split_points[reg] == false);
1718
1719 /* j = 0 case */
1720 new_reg_offset[reg] = 0;
1721 reg++;
1722 int offset = 1;
1723
1724 /* j > 0 case */
1725 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1726 /* If this is a split point, reset the offset to 0 and allocate a new
1727 * virtual GRF covering the previous `offset` registers.
1728 */
1729 if (split_points[reg]) {
1730 assert(offset <= MAX_VGRF_SIZE);
1731 int grf = alloc.allocate(offset);
1732 for (int k = reg - offset; k < reg; k++)
1733 new_virtual_grf[k] = grf;
1734 offset = 0;
1735 }
1736 new_reg_offset[reg] = offset;
1737 offset++;
1738 reg++;
1739 }
1740
1741 /* The last one gets the original register number */
1742 assert(offset <= MAX_VGRF_SIZE);
1743 alloc.sizes[i] = offset;
1744 for (int k = reg - offset; k < reg; k++)
1745 new_virtual_grf[k] = i;
1746 }
1747 assert(reg == reg_count);
1748
1749 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1750 if (inst->dst.file == VGRF) {
1751 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
1752 inst->dst.nr = new_virtual_grf[reg];
1753 inst->dst.reg_offset = new_reg_offset[reg];
1754 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1755 }
1756 for (int i = 0; i < inst->sources; i++) {
1757 if (inst->src[i].file == VGRF) {
1758 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
1759 inst->src[i].nr = new_virtual_grf[reg];
1760 inst->src[i].reg_offset = new_reg_offset[reg];
1761 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1762 }
1763 }
1764 }
1765 invalidate_live_intervals();
1766 }
1767
1768 /**
1769 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1770 *
1771 * During code generation, we create tons of temporary variables, many of
1772 * which get immediately killed and are never used again. Yet, in later
1773 * optimization and analysis passes, such as compute_live_intervals, we need
1774 * to loop over all the virtual GRFs. Compacting them can save a lot of
1775 * overhead.
1776 */
1777 bool
1778 fs_visitor::compact_virtual_grfs()
1779 {
1780 bool progress = false;
1781 int remap_table[this->alloc.count];
1782 memset(remap_table, -1, sizeof(remap_table));
1783
1784 /* Mark which virtual GRFs are used. */
1785 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1786 if (inst->dst.file == VGRF)
1787 remap_table[inst->dst.nr] = 0;
1788
1789 for (int i = 0; i < inst->sources; i++) {
1790 if (inst->src[i].file == VGRF)
1791 remap_table[inst->src[i].nr] = 0;
1792 }
1793 }
1794
1795 /* Compact the GRF arrays. */
1796 int new_index = 0;
1797 for (unsigned i = 0; i < this->alloc.count; i++) {
1798 if (remap_table[i] == -1) {
1799 /* We just found an unused register. This means that we are
1800 * actually going to compact something.
1801 */
1802 progress = true;
1803 } else {
1804 remap_table[i] = new_index;
1805 alloc.sizes[new_index] = alloc.sizes[i];
1806 invalidate_live_intervals();
1807 ++new_index;
1808 }
1809 }
1810
1811 this->alloc.count = new_index;
1812
1813 /* Patch all the instructions to use the newly renumbered registers */
1814 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1815 if (inst->dst.file == VGRF)
1816 inst->dst.nr = remap_table[inst->dst.nr];
1817
1818 for (int i = 0; i < inst->sources; i++) {
1819 if (inst->src[i].file == VGRF)
1820 inst->src[i].nr = remap_table[inst->src[i].nr];
1821 }
1822 }
1823
1824 /* Patch all the references to delta_xy, since they're used in register
1825 * allocation. If they're unused, switch them to BAD_FILE so we don't
1826 * think some random VGRF is delta_xy.
1827 */
1828 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1829 if (delta_xy[i].file == VGRF) {
1830 if (remap_table[delta_xy[i].nr] != -1) {
1831 delta_xy[i].nr = remap_table[delta_xy[i].nr];
1832 } else {
1833 delta_xy[i].file = BAD_FILE;
1834 }
1835 }
1836 }
1837
1838 return progress;
1839 }
1840
1841 /**
1842 * Assign UNIFORM file registers to either push constants or pull constants.
1843 *
1844 * We allow a fragment shader to have more than the specified minimum
1845 * maximum number of fragment shader uniform components (64). If
1846 * there are too many of these, they'd fill up all of the register space.
1847 * So, this will push some of them out to the pull constant buffer and
1848 * update the program to load them. We also use pull constants for all
1849 * indirect constant loads because we don't support indirect accesses in
1850 * registers yet.
1851 */
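/* A hypothetical example of the resulting split: with 160 live uniform
 * components and the 128-component push limit below, the first 128 live
 * uniforms (in index order) get slots in push_constant_loc[], while the
 * remaining 32, plus anything accessed through reladdr, get slots in
 * pull_constant_loc[] instead.
 */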
1852 void
1853 fs_visitor::assign_constant_locations()
1854 {
1855 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1856 if (dispatch_width != 8)
1857 return;
1858
1859 unsigned int num_pull_constants = 0;
1860
1861 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1862 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1863
1864 bool is_live[uniforms];
1865 memset(is_live, 0, sizeof(is_live));
1866
1867 /* First, we walk through the instructions and do two things:
1868 *
1869 * 1) Figure out which uniforms are live.
1870 *
1871 * 2) Find all indirect accesses of uniform arrays and flag them as needing
1872 * to go into the pull constant buffer.
1873 *
1874 * Note that we don't move constant-indexed accesses to arrays. No
1875 * testing has been done of the performance impact of this choice.
1876 */
1877 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1878 for (int i = 0 ; i < inst->sources; i++) {
1879 if (inst->src[i].file != UNIFORM)
1880 continue;
1881
1882 if (inst->src[i].reladdr) {
1883 int uniform = inst->src[i].nr;
1884
1885 /* If this array isn't already present in the pull constant buffer,
1886 * add it.
1887 */
1888 if (pull_constant_loc[uniform] == -1) {
1889 assert(param_size[uniform]);
1890 for (int j = 0; j < param_size[uniform]; j++)
1891 pull_constant_loc[uniform + j] = num_pull_constants++;
1892 }
1893 } else {
1894 /* Mark the one accessed uniform as live */
1895 int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
1896 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1897 is_live[constant_nr] = true;
1898 }
1899 }
1900 }
1901
1902 /* Only allow 16 registers (128 uniform components) as push constants.
1903 *
1904 * Just demote the end of the list. We could probably do better
1905 * here, demoting things that are rarely used in the program first.
1906 *
1907 * If changing this value, note the limitation about total_regs in
1908 * brw_curbe.c.
1909 */
1910 unsigned int max_push_components = 16 * 8;
1911 unsigned int num_push_constants = 0;
1912
1913 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1914
1915 for (unsigned int i = 0; i < uniforms; i++) {
1916 if (!is_live[i] || pull_constant_loc[i] != -1) {
1917 /* This UNIFORM register is either dead, or has already been demoted
1918 * to a pull const. Mark it as no longer living in the param[] array.
1919 */
1920 push_constant_loc[i] = -1;
1921 continue;
1922 }
1923
1924 if (num_push_constants < max_push_components) {
1925 /* Retain as a push constant. Record the location in the param[]
1926 * array.
1927 */
1928 push_constant_loc[i] = num_push_constants++;
1929 } else {
1930 /* Demote to a pull constant. */
1931 push_constant_loc[i] = -1;
1932 pull_constant_loc[i] = num_pull_constants++;
1933 }
1934 }
1935
1936 stage_prog_data->nr_params = num_push_constants;
1937 stage_prog_data->nr_pull_params = num_pull_constants;
1938
1939 /* Up until now, the param[] array has been indexed by reg + reg_offset
1940 * of UNIFORM registers. Move pull constants into pull_param[] and
1941 * condense param[] to only contain the uniforms we chose to push.
1942 *
1943 * NOTE: Because we are condensing the param[] array, we know that
1944 * push_constant_loc[i] <= i and we can do it in one smooth loop without
1945 * having to make a copy.
1946 */
1947 for (unsigned int i = 0; i < uniforms; i++) {
1948 const gl_constant_value *value = stage_prog_data->param[i];
1949
1950 if (pull_constant_loc[i] != -1) {
1951 stage_prog_data->pull_param[pull_constant_loc[i]] = value;
1952 } else if (push_constant_loc[i] != -1) {
1953 stage_prog_data->param[push_constant_loc[i]] = value;
1954 }
1955 }
1956 }
1957
1958 /**
1959 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1960 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1961 */
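/* Schematically (a sketch in IR-dump notation, register numbers
 * hypothetical), a UNIFORM source such as
 *
 *    add(8)   vgrf4:F   u8:F   vgrf2:F
 *
 * is rewritten so that u8 is first loaded from the pull constant buffer
 * into a fresh VGRF, and the ADD then reads the appropriate component of
 * that temporary instead of the uniform.
 */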
1962 void
1963 fs_visitor::demote_pull_constants()
1964 {
1965 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1966 for (int i = 0; i < inst->sources; i++) {
1967 if (inst->src[i].file != UNIFORM)
1968 continue;
1969
1970 int pull_index;
1971 unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
1972 if (location >= uniforms) /* Out of bounds access */
1973 pull_index = -1;
1974 else
1975 pull_index = pull_constant_loc[location];
1976
1977 if (pull_index == -1)
1978 continue;
1979
1980 /* Set up the annotation tracking for newly generated instructions. */
1981 const fs_builder ibld(this, block, inst);
1982 const unsigned index = stage_prog_data->binding_table.pull_constants_start;
1983 fs_reg dst = vgrf(glsl_type::float_type);
1984
1985 assert(inst->src[i].stride == 0);
1986
1987 /* Generate a pull load into dst. */
1988 if (inst->src[i].reladdr) {
1989 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1990 brw_imm_ud(index),
1991 *inst->src[i].reladdr,
1992 pull_index);
1993 inst->src[i].reladdr = NULL;
1994 inst->src[i].stride = 1;
1995 } else {
1996 const fs_builder ubld = ibld.exec_all().group(8, 0);
1997 struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
1998 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1999 dst, brw_imm_ud(index), offset);
2000 inst->src[i].set_smear(pull_index & 3);
2001 }
2002 brw_mark_surface_used(prog_data, index);
2003
2004 /* Rewrite the instruction to use the temporary VGRF. */
2005 inst->src[i].file = VGRF;
2006 inst->src[i].nr = dst.nr;
2007 inst->src[i].reg_offset = 0;
2008 }
2009 }
2010 invalidate_live_intervals();
2011 }
2012
2013 bool
2014 fs_visitor::opt_algebraic()
2015 {
2016 bool progress = false;
2017
2018 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2019 switch (inst->opcode) {
2020 case BRW_OPCODE_MOV:
2021 if (inst->src[0].file != IMM)
2022 break;
2023
2024 if (inst->saturate) {
2025 if (inst->dst.type != inst->src[0].type)
2026 assert(!"unimplemented: saturate mixed types");
2027
2028 if (brw_saturate_immediate(inst->dst.type,
2029 &inst->src[0].as_brw_reg())) {
2030 inst->saturate = false;
2031 progress = true;
2032 }
2033 }
2034 break;
2035
2036 case BRW_OPCODE_MUL:
2037 if (inst->src[1].file != IMM)
2038 continue;
2039
2040 /* a * 1.0 = a */
2041 if (inst->src[1].is_one()) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043 inst->src[1] = reg_undef;
2044 progress = true;
2045 break;
2046 }
2047
2048 /* a * -1.0 = -a */
2049 if (inst->src[1].is_negative_one()) {
2050 inst->opcode = BRW_OPCODE_MOV;
2051 inst->src[0].negate = !inst->src[0].negate;
2052 inst->src[1] = reg_undef;
2053 progress = true;
2054 break;
2055 }
2056
2057 /* a * 0.0 = 0.0 */
2058 if (inst->src[1].is_zero()) {
2059 inst->opcode = BRW_OPCODE_MOV;
2060 inst->src[0] = inst->src[1];
2061 inst->src[1] = reg_undef;
2062 progress = true;
2063 break;
2064 }
2065
2066 if (inst->src[0].file == IMM) {
2067 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2068 inst->opcode = BRW_OPCODE_MOV;
2069 inst->src[0].f *= inst->src[1].f;
2070 inst->src[1] = reg_undef;
2071 progress = true;
2072 break;
2073 }
2074 break;
2075 case BRW_OPCODE_ADD:
2076 if (inst->src[1].file != IMM)
2077 continue;
2078
2079 /* a + 0.0 = a */
2080 if (inst->src[1].is_zero()) {
2081 inst->opcode = BRW_OPCODE_MOV;
2082 inst->src[1] = reg_undef;
2083 progress = true;
2084 break;
2085 }
2086
2087 if (inst->src[0].file == IMM) {
2088 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2089 inst->opcode = BRW_OPCODE_MOV;
2090 inst->src[0].f += inst->src[1].f;
2091 inst->src[1] = reg_undef;
2092 progress = true;
2093 break;
2094 }
2095 break;
2096 case BRW_OPCODE_OR:
2097 if (inst->src[0].equals(inst->src[1])) {
2098 inst->opcode = BRW_OPCODE_MOV;
2099 inst->src[1] = reg_undef;
2100 progress = true;
2101 break;
2102 }
2103 break;
2104 case BRW_OPCODE_LRP:
2105 if (inst->src[1].equals(inst->src[2])) {
2106 inst->opcode = BRW_OPCODE_MOV;
2107 inst->src[0] = inst->src[1];
2108 inst->src[1] = reg_undef;
2109 inst->src[2] = reg_undef;
2110 progress = true;
2111 break;
2112 }
2113 break;
2114 case BRW_OPCODE_CMP:
2115 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2116 inst->src[0].abs &&
2117 inst->src[0].negate &&
2118 inst->src[1].is_zero()) {
2119 inst->src[0].abs = false;
2120 inst->src[0].negate = false;
2121 inst->conditional_mod = BRW_CONDITIONAL_Z;
2122 progress = true;
2123 break;
2124 }
2125 break;
2126 case BRW_OPCODE_SEL:
2127 if (inst->src[0].equals(inst->src[1])) {
2128 inst->opcode = BRW_OPCODE_MOV;
2129 inst->src[1] = reg_undef;
2130 inst->predicate = BRW_PREDICATE_NONE;
2131 inst->predicate_inverse = false;
2132 progress = true;
2133 } else if (inst->saturate && inst->src[1].file == IMM) {
2134 switch (inst->conditional_mod) {
2135 case BRW_CONDITIONAL_LE:
2136 case BRW_CONDITIONAL_L:
2137 switch (inst->src[1].type) {
2138 case BRW_REGISTER_TYPE_F:
2139 if (inst->src[1].f >= 1.0f) {
2140 inst->opcode = BRW_OPCODE_MOV;
2141 inst->src[1] = reg_undef;
2142 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2143 progress = true;
2144 }
2145 break;
2146 default:
2147 break;
2148 }
2149 break;
2150 case BRW_CONDITIONAL_GE:
2151 case BRW_CONDITIONAL_G:
2152 switch (inst->src[1].type) {
2153 case BRW_REGISTER_TYPE_F:
2154 if (inst->src[1].f <= 0.0f) {
2155 inst->opcode = BRW_OPCODE_MOV;
2156 inst->src[1] = reg_undef;
2157 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2158 progress = true;
2159 }
2160 break;
2161 default:
2162 break;
2163 }
2164 default:
2165 break;
2166 }
2167 }
2168 break;
2169 case BRW_OPCODE_MAD:
2170 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2171 inst->opcode = BRW_OPCODE_MOV;
2172 inst->src[1] = reg_undef;
2173 inst->src[2] = reg_undef;
2174 progress = true;
2175 } else if (inst->src[0].is_zero()) {
2176 inst->opcode = BRW_OPCODE_MUL;
2177 inst->src[0] = inst->src[2];
2178 inst->src[2] = reg_undef;
2179 progress = true;
2180 } else if (inst->src[1].is_one()) {
2181 inst->opcode = BRW_OPCODE_ADD;
2182 inst->src[1] = inst->src[2];
2183 inst->src[2] = reg_undef;
2184 progress = true;
2185 } else if (inst->src[2].is_one()) {
2186 inst->opcode = BRW_OPCODE_ADD;
2187 inst->src[2] = reg_undef;
2188 progress = true;
2189 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2190 inst->opcode = BRW_OPCODE_ADD;
2191 inst->src[1].f *= inst->src[2].f;
2192 inst->src[2] = reg_undef;
2193 progress = true;
2194 }
2195 break;
2196 case SHADER_OPCODE_RCP: {
2197 fs_inst *prev = (fs_inst *)inst->prev;
2198 if (prev->opcode == SHADER_OPCODE_SQRT) {
2199 if (inst->src[0].equals(prev->dst)) {
2200 inst->opcode = SHADER_OPCODE_RSQ;
2201 inst->src[0] = prev->src[0];
2202 progress = true;
2203 }
2204 }
2205 break;
2206 }
2207 case SHADER_OPCODE_BROADCAST:
2208 if (is_uniform(inst->src[0])) {
2209 inst->opcode = BRW_OPCODE_MOV;
2210 inst->sources = 1;
2211 inst->force_writemask_all = true;
2212 progress = true;
2213 } else if (inst->src[1].file == IMM) {
2214 inst->opcode = BRW_OPCODE_MOV;
2215 inst->src[0] = component(inst->src[0],
2216 inst->src[1].ud);
2217 inst->sources = 1;
2218 inst->force_writemask_all = true;
2219 progress = true;
2220 }
2221 break;
2222
2223 default:
2224 break;
2225 }
2226
2227 /* Swap if src[0] is immediate. */
2228 if (progress && inst->is_commutative()) {
2229 if (inst->src[0].file == IMM) {
2230 fs_reg tmp = inst->src[1];
2231 inst->src[1] = inst->src[0];
2232 inst->src[0] = tmp;
2233 }
2234 }
2235 }
2236 return progress;
2237 }
2238
2239 /**
2240 * Optimize sample messages that have constant zero values for the trailing
2241 * texture coordinates. We can just reduce the message length for these
2242 * instructions instead of reserving registers for them. Trailing parameters
2243 * that aren't sent default to zero anyway. This will cause the dead code
2244 * eliminator to remove the MOV instruction that would otherwise be emitted to
2245 * set up the zero value.
2246 */
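/* As a hypothetical SIMD8 example, a texturing message whose payload was
 * built as
 *
 *    load_payload  { header, u, v, 0.0f, 0.0f }      mlen 5
 *
 * can have its mlen reduced to 3: the two trailing zero parameters are
 * simply not sent, and the hardware treats unsent parameters as zero.
 */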
2247 bool
2248 fs_visitor::opt_zero_samples()
2249 {
2250 /* Gen4 infers the texturing opcode based on the message length so we can't
2251 * change it.
2252 */
2253 if (devinfo->gen < 5)
2254 return false;
2255
2256 bool progress = false;
2257
2258 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2259 if (!inst->is_tex())
2260 continue;
2261
2262 fs_inst *load_payload = (fs_inst *) inst->prev;
2263
2264 if (load_payload->is_head_sentinel() ||
2265 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2266 continue;
2267
2268 /* We don't want to remove the message header or the first parameter.
2269 * Removing the first parameter is not allowed, see the Haswell PRM
2270 * volume 7, page 149:
2271 *
2272 * "Parameter 0 is required except for the sampleinfo message, which
2273 * has no parameter 0"
2274 */
2275 while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2276 load_payload->src[(inst->mlen - inst->header_size) /
2277 (inst->exec_size / 8) +
2278 inst->header_size - 1].is_zero()) {
2279 inst->mlen -= inst->exec_size / 8;
2280 progress = true;
2281 }
2282 }
2283
2284 if (progress)
2285 invalidate_live_intervals();
2286
2287 return progress;
2288 }
2289
2290 /**
2291 * Optimize sample messages which are followed by the final RT write.
2292 *
2293 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2294 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2295 * final texturing results copied to the framebuffer write payload and modify
2296 * them to write to the framebuffer directly.
2297 */
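/* Schematically (not actual assembly), the final pair
 *
 *    tex(16)       vgrf10  ...
 *    fb_write(16)  ...     vgrf10  { EOT }
 *
 * is collapsed into a single texturing SEND with EOT set and the render
 * target encoded in bits 31:24 of the message header offset, after which
 * the trailing FB write is removed.
 */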
2298 bool
2299 fs_visitor::opt_sampler_eot()
2300 {
2301 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2302
2303 if (stage != MESA_SHADER_FRAGMENT)
2304 return false;
2305
2306 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2307 return false;
2308
2309 /* FINISHME: It should be possible to implement this optimization when there
2310 * are multiple drawbuffers.
2311 */
2312 if (key->nr_color_regions != 1)
2313 return false;
2314
2315 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2316 bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
2317 fs_inst *fb_write = (fs_inst *)block->end();
2318 assert(fb_write->eot);
2319 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2320
2321 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2322
2323 /* There wasn't one; nothing to do. */
2324 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2325 return false;
2326
2327 /* 3D Sampler » Messages » Message Format
2328 *
2329 * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
2330 * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
2331 */
2332 if (tex_inst->opcode == SHADER_OPCODE_TXS ||
2333 tex_inst->opcode == SHADER_OPCODE_SAMPLEINFO ||
2334 tex_inst->opcode == SHADER_OPCODE_LOD ||
2335 tex_inst->opcode == SHADER_OPCODE_TG4 ||
2336 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2337 return false;
2338
2339 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2340 * It's very likely to be the previous instruction.
2341 */
2342 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2343 if (load_payload->is_head_sentinel() ||
2344 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2345 return false;
2346
2347 assert(!tex_inst->eot); /* We can't get here twice */
2348 assert((tex_inst->offset & (0xff << 24)) == 0);
2349
2350 const fs_builder ibld(this, block, tex_inst);
2351
2352 tex_inst->offset |= fb_write->target << 24;
2353 tex_inst->eot = true;
2354 tex_inst->dst = ibld.null_reg_ud();
2355 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2356
2357 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2358 * to create a new LOAD_PAYLOAD command with the same sources and a space
2359 * saved for the header. Using a new destination register not only makes sure
2360 * we have enough space, but also ensures that the dead code eliminator kills
2361 * the instruction that this will replace.
2362 */
2363 if (tex_inst->header_size != 0)
2364 return true;
2365
2366 fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
2367 load_payload->sources + 1);
2368 fs_reg *new_sources =
2369 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2370
2371 new_sources[0] = fs_reg();
2372 for (int i = 0; i < load_payload->sources; i++)
2373 new_sources[i+1] = load_payload->src[i];
2374
2375 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2376 * requires a lot of information about the sources to appropriately figure
2377 * out the number of registers that need to be used. Given this stage in our
2378 * optimization, we may not have the appropriate GRFs required by
2379 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2380 * manually emit the instruction.
2381 */
2382 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2383 load_payload->exec_size,
2384 send_header,
2385 new_sources,
2386 load_payload->sources + 1);
2387
2388 new_load_payload->regs_written = load_payload->regs_written + 1;
2389 new_load_payload->header_size = 1;
2390 tex_inst->mlen++;
2391 tex_inst->header_size = 1;
2392 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2393 tex_inst->src[0] = send_header;
2394
2395 return true;
2396 }
2397
2398 bool
2399 fs_visitor::opt_register_renaming()
2400 {
2401 bool progress = false;
2402 int depth = 0;
2403
2404 int remap[alloc.count];
2405 memset(remap, -1, sizeof(int) * alloc.count);
2406
2407 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2408 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2409 depth++;
2410 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2411 inst->opcode == BRW_OPCODE_WHILE) {
2412 depth--;
2413 }
2414
2415 /* Rewrite instruction sources. */
2416 for (int i = 0; i < inst->sources; i++) {
2417 if (inst->src[i].file == VGRF &&
2418 remap[inst->src[i].nr] != -1 &&
2419 remap[inst->src[i].nr] != inst->src[i].nr) {
2420 inst->src[i].nr = remap[inst->src[i].nr];
2421 progress = true;
2422 }
2423 }
2424
2425 const int dst = inst->dst.nr;
2426
2427 if (depth == 0 &&
2428 inst->dst.file == VGRF &&
2429 alloc.sizes[inst->dst.nr] == inst->exec_size / 8 &&
2430 !inst->is_partial_write()) {
2431 if (remap[dst] == -1) {
2432 remap[dst] = dst;
2433 } else {
2434 remap[dst] = alloc.allocate(inst->exec_size / 8);
2435 inst->dst.nr = remap[dst];
2436 progress = true;
2437 }
2438 } else if (inst->dst.file == VGRF &&
2439 remap[dst] != -1 &&
2440 remap[dst] != dst) {
2441 inst->dst.nr = remap[dst];
2442 progress = true;
2443 }
2444 }
2445
2446 if (progress) {
2447 invalidate_live_intervals();
2448
2449 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2450 if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
2451 delta_xy[i].nr = remap[delta_xy[i].nr];
2452 }
2453 }
2454 }
2455
2456 return progress;
2457 }
2458
2459 /**
2460 * Remove redundant or useless discard jumps.
2461 *
2462 * For example, we can eliminate jumps in the following sequence:
2463 *
2464 * discard-jump (redundant with the next jump)
2465 * discard-jump (useless; jumps to the next instruction)
2466 * placeholder-halt
2467 */
2468 bool
2469 fs_visitor::opt_redundant_discard_jumps()
2470 {
2471 bool progress = false;
2472
2473 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2474
2475 fs_inst *placeholder_halt = NULL;
2476 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2477 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2478 placeholder_halt = inst;
2479 break;
2480 }
2481 }
2482
2483 if (!placeholder_halt)
2484 return false;
2485
2486 /* Delete any HALTs immediately before the placeholder halt. */
2487 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2488 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2489 prev = (fs_inst *) placeholder_halt->prev) {
2490 prev->remove(last_bblock);
2491 progress = true;
2492 }
2493
2494 if (progress)
2495 invalidate_live_intervals();
2496
2497 return progress;
2498 }
2499
2500 bool
2501 fs_visitor::compute_to_mrf()
2502 {
2503 bool progress = false;
2504 int next_ip = 0;
2505
2506 /* No MRFs on Gen >= 7. */
2507 if (devinfo->gen >= 7)
2508 return false;
2509
2510 calculate_live_intervals();
2511
2512 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2513 int ip = next_ip;
2514 next_ip++;
2515
2516 if (inst->opcode != BRW_OPCODE_MOV ||
2517 inst->is_partial_write() ||
2518 inst->dst.file != MRF || inst->src[0].file != VGRF ||
2519 inst->dst.type != inst->src[0].type ||
2520 inst->src[0].abs || inst->src[0].negate ||
2521 !inst->src[0].is_contiguous() ||
2522 inst->src[0].subreg_offset)
2523 continue;
2524
2525 /* Work out which hardware MRF registers are written by this
2526 * instruction.
2527 */
2528 int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
2529 int mrf_high;
2530 if (inst->dst.nr & BRW_MRF_COMPR4) {
2531 mrf_high = mrf_low + 4;
2532 } else if (inst->exec_size == 16) {
2533 mrf_high = mrf_low + 1;
2534 } else {
2535 mrf_high = mrf_low;
2536 }
2537
2538 /* Can't compute-to-MRF this GRF if someone else was going to
2539 * read it later.
2540 */
2541 if (this->virtual_grf_end[inst->src[0].nr] > ip)
2542 continue;
2543
2544 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2545 * the instruction that generated this GRF to write into the MRF instead.
2546 */
2547 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2548 if (scan_inst->dst.file == VGRF &&
2549 scan_inst->dst.nr == inst->src[0].nr) {
2550 /* Found the last instruction that writes the register we want to
2551 * turn into a compute-to-MRF.
2552 */
2553
2554 /* If this one instruction didn't populate all the
2555 * channels, bail. We might be able to rewrite everything
2556 * that writes that reg, but it would require smarter
2557 * tracking to delay the rewriting until complete success.
2558 */
2559 if (scan_inst->is_partial_write())
2560 break;
2561
2562 /* Instructions returning more than one register would require us to
2563 * understand how to coalesce more than one MOV at a time.
2564 */
2565 if (scan_inst->regs_written > scan_inst->exec_size / 8)
2566 break;
2567
2568 /* SEND instructions can't have MRF as a destination. */
2569 if (scan_inst->mlen)
2570 break;
2571
2572 if (devinfo->gen == 6) {
2573 /* gen6 math instructions must have the destination be
2574 * GRF, so no compute-to-MRF for them.
2575 */
2576 if (scan_inst->is_math()) {
2577 break;
2578 }
2579 }
2580
2581 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2582 /* Found the creator of our MRF's source value. */
2583 scan_inst->dst.file = MRF;
2584 scan_inst->dst.nr = inst->dst.nr;
2585 scan_inst->saturate |= inst->saturate;
2586 inst->remove(block);
2587 progress = true;
2588 }
2589 break;
2590 }
2591
2592 /* We don't handle control flow here. Most computation of
2593 * values that end up in MRFs are shortly before the MRF
2594 * write anyway.
2595 */
2596 if (block->start() == scan_inst)
2597 break;
2598
2599 /* You can't read from an MRF, so if someone else reads our
2600 * MRF's source GRF that we wanted to rewrite, that stops us.
2601 */
2602 bool interfered = false;
2603 for (int i = 0; i < scan_inst->sources; i++) {
2604 if (scan_inst->src[i].file == VGRF &&
2605 scan_inst->src[i].nr == inst->src[0].nr &&
2606 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2607 interfered = true;
2608 }
2609 }
2610 if (interfered)
2611 break;
2612
2613 if (scan_inst->dst.file == MRF) {
2614 /* If somebody else writes our MRF here, we can't
2615 * compute-to-MRF before that.
2616 */
2617 int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
2618 int scan_mrf_high;
2619
2620 if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
2621 scan_mrf_high = scan_mrf_low + 4;
2622 } else if (scan_inst->exec_size == 16) {
2623 scan_mrf_high = scan_mrf_low + 1;
2624 } else {
2625 scan_mrf_high = scan_mrf_low;
2626 }
2627
2628 if (mrf_low == scan_mrf_low ||
2629 mrf_low == scan_mrf_high ||
2630 mrf_high == scan_mrf_low ||
2631 mrf_high == scan_mrf_high) {
2632 break;
2633 }
2634 }
2635
2636 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2637 /* Found a SEND instruction, which means that there are
2638 * live values in MRFs from base_mrf to base_mrf +
2639 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2640 * above it.
2641 */
2642 if (mrf_low >= scan_inst->base_mrf &&
2643 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2644 break;
2645 }
2646 if (mrf_high >= scan_inst->base_mrf &&
2647 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2648 break;
2649 }
2650 }
2651 }
2652 }
2653
2654 if (progress)
2655 invalidate_live_intervals();
2656
2657 return progress;
2658 }
2659
2660 /**
2661 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2662 * flow. We could probably do better here with some form of divergence
2663 * analysis.
2664 */
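/* Outside of any control flow, channel 0 is always live, so the instruction
 * can simply be replaced by an immediate zero. Roughly (register numbers
 * hypothetical):
 *
 *    find_live_channel(8)  vgrf4:UD    ->    mov(8)  vgrf4:UD  0x0UD
 */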
2665 bool
2666 fs_visitor::eliminate_find_live_channel()
2667 {
2668 bool progress = false;
2669 unsigned depth = 0;
2670
2671 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2672 switch (inst->opcode) {
2673 case BRW_OPCODE_IF:
2674 case BRW_OPCODE_DO:
2675 depth++;
2676 break;
2677
2678 case BRW_OPCODE_ENDIF:
2679 case BRW_OPCODE_WHILE:
2680 depth--;
2681 break;
2682
2683 case FS_OPCODE_DISCARD_JUMP:
2684 /* This can potentially make control flow non-uniform until the end
2685 * of the program.
2686 */
2687 return progress;
2688
2689 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2690 if (depth == 0) {
2691 inst->opcode = BRW_OPCODE_MOV;
2692 inst->src[0] = brw_imm_ud(0u);
2693 inst->sources = 1;
2694 inst->force_writemask_all = true;
2695 progress = true;
2696 }
2697 break;
2698
2699 default:
2700 break;
2701 }
2702 }
2703
2704 return progress;
2705 }
2706
2707 /**
2708 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2709 * instructions to FS_OPCODE_REP_FB_WRITE.
2710 */
2711 void
2712 fs_visitor::emit_repclear_shader()
2713 {
2714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2715 int base_mrf = 1;
2716 int color_mrf = base_mrf + 2;
2717
2718 fs_inst *mov = bld.exec_all().group(4, 0)
2719 .MOV(brw_message_reg(color_mrf),
2720 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2721
2722 fs_inst *write;
2723 if (key->nr_color_regions == 1) {
2724 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2725 write->saturate = key->clamp_fragment_color;
2726 write->base_mrf = color_mrf;
2727 write->target = 0;
2728 write->header_size = 0;
2729 write->mlen = 1;
2730 } else {
2731 assume(key->nr_color_regions > 0);
2732 for (int i = 0; i < key->nr_color_regions; ++i) {
2733 write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2734 write->saturate = key->clamp_fragment_color;
2735 write->base_mrf = base_mrf;
2736 write->target = i;
2737 write->header_size = 2;
2738 write->mlen = 3;
2739 }
2740 }
2741 write->eot = true;
2742
2743 calculate_cfg();
2744
2745 assign_constant_locations();
2746 assign_curb_setup();
2747
2748 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2749 assert(mov->src[0].file == FIXED_GRF);
2750 mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
2751 }
2752
2753 /**
2754 * Walks through basic blocks, looking for repeated MRF writes and
2755 * removing the later ones.
2756 */
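/* For instance (a hypothetical sequence), in
 *
 *    mov(8)  m2  vgrf4
 *    ...                      (nothing writes m2 or vgrf4 in between)
 *    mov(8)  m2  vgrf4
 *
 * the second MOV is redundant and gets removed.
 */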
2757 bool
2758 fs_visitor::remove_duplicate_mrf_writes()
2759 {
2760 fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
2761 bool progress = false;
2762
2763 /* We'd need to update the MRF tracking to handle compressed instructions, so skip SIMD16 for now. */
2764 if (dispatch_width == 16)
2765 return false;
2766
2767 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2768
2769 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2770 if (inst->is_control_flow()) {
2771 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2772 }
2773
2774 if (inst->opcode == BRW_OPCODE_MOV &&
2775 inst->dst.file == MRF) {
2776 fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
2777 if (prev_inst && inst->equals(prev_inst)) {
2778 inst->remove(block);
2779 progress = true;
2780 continue;
2781 }
2782 }
2783
2784 /* Clear out the last-write records for MRFs that were overwritten. */
2785 if (inst->dst.file == MRF) {
2786 last_mrf_move[inst->dst.nr] = NULL;
2787 }
2788
2789 if (inst->mlen > 0 && inst->base_mrf != -1) {
2790 /* Found a SEND instruction, which will include two or fewer
2791 * implied MRF writes. We could do better here.
2792 */
2793 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2794 last_mrf_move[inst->base_mrf + i] = NULL;
2795 }
2796 }
2797
2798 /* Clear out any MRF move records whose sources got overwritten. */
2799 if (inst->dst.file == VGRF) {
2800 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2801 if (last_mrf_move[i] &&
2802 last_mrf_move[i]->src[0].nr == inst->dst.nr) {
2803 last_mrf_move[i] = NULL;
2804 }
2805 }
2806 }
2807
2808 if (inst->opcode == BRW_OPCODE_MOV &&
2809 inst->dst.file == MRF &&
2810 inst->src[0].file == VGRF &&
2811 !inst->is_partial_write()) {
2812 last_mrf_move[inst->dst.nr] = inst;
2813 }
2814 }
2815
2816 if (progress)
2817 invalidate_live_intervals();
2818
2819 return progress;
2820 }
2821
2822 static void
2823 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2824 {
2825 /* Clear the flag for registers that actually got read (as expected). */
2826 for (int i = 0; i < inst->sources; i++) {
2827 int grf;
2828 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
2829 grf = inst->src[i].nr;
2830 } else {
2831 continue;
2832 }
2833
2834 if (grf >= first_grf &&
2835 grf < first_grf + grf_len) {
2836 deps[grf - first_grf] = false;
2837 if (inst->exec_size == 16)
2838 deps[grf - first_grf + 1] = false;
2839 }
2840 }
2841 }
2842
2843 /**
2844 * Implements this workaround for the original 965:
2845 *
2846 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2847 * check for post destination dependencies on this instruction, software
2848 * must ensure that there is no destination hazard for the case of ‘write
2849 * followed by a posted write’ shown in the following example.
2850 *
2851 * 1. mov r3 0
2852 * 2. send r3.xy <rest of send instruction>
2853 * 3. mov r2 r3
2854 *
2855 * Due to no post-destination dependency check on the ‘send’, the above
2856 * code sequence could have two instructions (1 and 2) in flight at the
2857 * same time that both consider ‘r3’ as the target of their final writes.
2858 */
2859 void
2860 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2861 fs_inst *inst)
2862 {
2863 int write_len = inst->regs_written;
2864 int first_write_grf = inst->dst.nr;
2865 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2866 assert(write_len < (int)sizeof(needs_dep) - 1);
2867
2868 memset(needs_dep, false, sizeof(needs_dep));
2869 memset(needs_dep, true, write_len);
2870
2871 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2872
2873 /* Walk backwards looking for writes to registers we're writing which
2874 * aren't read since being written. If we hit the start of the program,
2875 * we assume that there are no outstanding dependencies on entry to the
2876 * program.
2877 */
2878 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
2879 /* If we hit control flow, assume that there *are* outstanding
2880 * dependencies, and force their cleanup before our instruction.
2881 */
2882 if (block->start() == scan_inst) {
2883 for (int i = 0; i < write_len; i++) {
2884 if (needs_dep[i])
2885 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
2886 first_write_grf + i);
2887 }
2888 return;
2889 }
2890
2891 /* We insert our reads as late as possible on the assumption that any
2892 * instruction but a MOV that might have left us an outstanding
2893 * dependency has more latency than a MOV.
2894 */
2895 if (scan_inst->dst.file == VGRF) {
2896 for (int i = 0; i < scan_inst->regs_written; i++) {
2897 int reg = scan_inst->dst.nr + i;
2898
2899 if (reg >= first_write_grf &&
2900 reg < first_write_grf + write_len &&
2901 needs_dep[reg - first_write_grf]) {
2902 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
2903 needs_dep[reg - first_write_grf] = false;
2904 if (scan_inst->exec_size == 16)
2905 needs_dep[reg - first_write_grf + 1] = false;
2906 }
2907 }
2908 }
2909
2910 /* Clear the flag for registers that actually got read (as expected). */
2911 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2912
2913 /* Continue the loop only if we haven't resolved all the dependencies */
2914 int i;
2915 for (i = 0; i < write_len; i++) {
2916 if (needs_dep[i])
2917 break;
2918 }
2919 if (i == write_len)
2920 return;
2921 }
2922 }
2923
2924 /**
2925 * Implements this workaround for the original 965:
2926 *
2927 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2928 * used as a destination register until after it has been sourced by an
2929 * instruction with a different destination register.
2930 */
2931 void
2932 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2933 {
2934 int write_len = inst->regs_written;
2935 int first_write_grf = inst->dst.nr;
2936 bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
2937 assert(write_len < (int)sizeof(needs_dep) - 1);
2938
2939 memset(needs_dep, false, sizeof(needs_dep));
2940 memset(needs_dep, true, write_len);
2941 /* Walk forwards looking for writes to registers we're writing which aren't
2942 * read before being written.
2943 */
2944 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
2945 /* If we hit control flow, force resolve all remaining dependencies. */
2946 if (block->end() == scan_inst) {
2947 for (int i = 0; i < write_len; i++) {
2948 if (needs_dep[i])
2949 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2950 first_write_grf + i);
2951 }
2952 return;
2953 }
2954
2955 /* Clear the flag for registers that actually got read (as expected). */
2956 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2957
2958 /* We insert our reads as late as possible since they're reading the
2959 * result of a SEND, which has massive latency.
2960 */
2961 if (scan_inst->dst.file == VGRF &&
2962 scan_inst->dst.nr >= first_write_grf &&
2963 scan_inst->dst.nr < first_write_grf + write_len &&
2964 needs_dep[scan_inst->dst.nr - first_write_grf]) {
2965 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2966 scan_inst->dst.nr);
2967 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
2968 }
2969
2970 /* Continue the loop only if we haven't resolved all the dependencies */
2971 int i;
2972 for (i = 0; i < write_len; i++) {
2973 if (needs_dep[i])
2974 break;
2975 }
2976 if (i == write_len)
2977 return;
2978 }
2979 }
2980
2981 void
2982 fs_visitor::insert_gen4_send_dependency_workarounds()
2983 {
2984 if (devinfo->gen != 4 || devinfo->is_g4x)
2985 return;
2986
2987 bool progress = false;
2988
2989 /* Note that we're done with register allocation, so GRF fs_regs always
2990 * have a .reg_offset of 0.
2991 */
2992
2993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2994 if (inst->mlen != 0 && inst->dst.file == VGRF) {
2995 insert_gen4_pre_send_dependency_workarounds(block, inst);
2996 insert_gen4_post_send_dependency_workarounds(block, inst);
2997 progress = true;
2998 }
2999 }
3000
3001 if (progress)
3002 invalidate_live_intervals();
3003 }
3004
3005 /**
3006 * Turns the generic expression-style uniform pull constant load instruction
3007 * into a hardware-specific series of instructions for loading a pull
3008 * constant.
3009 *
3010 * The expression style allows the CSE pass before this to optimize out
3011 * repeated loads from the same offset, and gives the pre-register-allocation
3012 * scheduling full flexibility, while the conversion to native instructions
3013 * allows the post-register-allocation scheduler the best information
3014 * possible.
3015 *
3016 * Note that execution masking for setting up pull constant loads is special:
3017 * the channels that need to be written are unrelated to the current execution
3018 * mask, since a later instruction will use one of the result channels as a
3019 * source operand for all 8 or 16 of its channels.
3020 */
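/* On Gen7+, roughly speaking (register numbers hypothetical), the generic
 *
 *    uniform_pull_constant_load(8)  vgrf10:F  surface_index  byte_offset
 *
 * is split into a force_writemask_all setup instruction that places the
 * dword offset into a payload register, followed by the Gen7-specific
 * SIMD4x2 pull constant load that reads from that payload.
 */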
3021 void
3022 fs_visitor::lower_uniform_pull_constant_loads()
3023 {
3024 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3025 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3026 continue;
3027
3028 if (devinfo->gen >= 7) {
3029 /* The offset arg before was a vec4-aligned byte offset. We need to
3030 * turn it into a dword offset.
3031 */
3032 fs_reg const_offset_reg = inst->src[1];
3033 assert(const_offset_reg.file == IMM &&
3034 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3035 const_offset_reg.ud /= 4;
3036
3037 fs_reg payload, offset;
3038 if (devinfo->gen >= 9) {
3039 /* We have to use a message header on Skylake to get SIMD4x2
3040 * mode. Reserve space for the register.
3041 */
3042 offset = payload = fs_reg(VGRF, alloc.allocate(2));
3043 offset.reg_offset++;
3044 inst->mlen = 2;
3045 } else {
3046 offset = payload = fs_reg(VGRF, alloc.allocate(1));
3047 inst->mlen = 1;
3048 }
3049
3050 /* This is actually going to be a MOV, but since only the first dword
3051 * is accessed, we have a special opcode to do just that one. Note
3052 * that this needs to be an operation that will be considered a def
3053 * by live variable analysis, or register allocation will explode.
3054 */
3055 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3056 8, offset, const_offset_reg);
3057 setup->force_writemask_all = true;
3058
3059 setup->ir = inst->ir;
3060 setup->annotation = inst->annotation;
3061 inst->insert_before(block, setup);
3062
3063 /* Similarly, this will only populate the first 4 channels of the
3064 * result register (since we only use smear values from 0-3), but we
3065 * don't tell the optimizer.
3066 */
3067 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3068 inst->src[1] = payload;
3069 inst->base_mrf = -1;
3070
3071 invalidate_live_intervals();
3072 } else {
3073 /* Before register allocation, we didn't tell the scheduler about the
3074 * MRF we use. We know it's safe to use this MRF because nothing
3075 * else does except for register spill/unspill, which generates and
3076 * uses its MRF within a single IR instruction.
3077 */
3078 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
3079 inst->mlen = 1;
3080 }
3081 }
3082 }
3083
3084 bool
3085 fs_visitor::lower_load_payload()
3086 {
3087 bool progress = false;
3088
3089 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3090 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3091 continue;
3092
3093 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3094 assert(inst->saturate == false);
3095 fs_reg dst = inst->dst;
3096
3097 /* Get rid of COMPR4. We'll add it back in if we need it */
3098 if (dst.file == MRF)
3099 dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3100
3101 const fs_builder ibld(this, block, inst);
3102 const fs_builder hbld = ibld.exec_all().group(8, 0);
3103
3104 for (uint8_t i = 0; i < inst->header_size; i++) {
3105 if (inst->src[i].file != BAD_FILE) {
3106 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3107 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3108 hbld.MOV(mov_dst, mov_src);
3109 }
3110 dst = offset(dst, hbld, 1);
3111 }
3112
3113 if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3114 inst->exec_size > 8) {
3115 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3116 * a straightforward copy. Instead, the result of the
3117 * LOAD_PAYLOAD is treated as interleaved and the first four
3118 * non-header sources are unpacked as:
3119 *
3120 * m + 0: r0
3121 * m + 1: g0
3122 * m + 2: b0
3123 * m + 3: a0
3124 * m + 4: r1
3125 * m + 5: g1
3126 * m + 6: b1
3127 * m + 7: a1
3128 *
3129 * This is used for gen <= 5 fb writes.
3130 */
3131 assert(inst->exec_size == 16);
3132 assert(inst->header_size + 4 <= inst->sources);
3133 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3134 if (inst->src[i].file != BAD_FILE) {
3135 if (devinfo->has_compr4) {
3136 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3137 compr4_dst.nr |= BRW_MRF_COMPR4;
3138 ibld.MOV(compr4_dst, inst->src[i]);
3139 } else {
3140 /* Platform doesn't have COMPR4. We have to fake it */
3141 fs_reg mov_dst = retype(dst, inst->src[i].type);
3142 ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3143 mov_dst.nr += 4;
3144 ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3145 }
3146 }
3147
3148 dst.nr++;
3149 }
3150
3151 /* The loop above only ever incremented us through the first set
3152 * of 4 registers. However, thanks to the magic of COMPR4, we
3153 * actually wrote to the first 8 registers, so we need to take
3154 * that into account now.
3155 */
3156 dst.nr += 4;
3157
3158 /* The COMPR4 code took care of the first 4 sources. We'll let
3159 * the regular path handle any remaining sources. Yes, we are
3160 * modifying the instruction but we're about to delete it so
3161 * this really doesn't hurt anything.
3162 */
3163 inst->header_size += 4;
3164 }
3165
3166 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3167 if (inst->src[i].file != BAD_FILE)
3168 ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3169 dst = offset(dst, ibld, 1);
3170 }
3171
3172 inst->remove(block);
3173 progress = true;
3174 }
3175
3176 if (progress)
3177 invalidate_live_intervals();
3178
3179 return progress;
3180 }
3181
3182 bool
3183 fs_visitor::lower_integer_multiplication()
3184 {
3185 bool progress = false;
3186
3187 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3188 const fs_builder ibld(this, block, inst);
3189
3190 if (inst->opcode == BRW_OPCODE_MUL) {
3191 if (inst->dst.is_accumulator() ||
3192 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3193 inst->dst.type != BRW_REGISTER_TYPE_UD))
3194 continue;
3195
3196 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
3197 * operation directly, but CHV/BXT cannot.
3198 */
3199 if (devinfo->gen >= 8 &&
3200 !devinfo->is_cherryview && !devinfo->is_broxton)
3201 continue;
3202
3203 if (inst->src[1].file == IMM &&
3204 inst->src[1].ud < (1 << 16)) {
3205 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3206 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3207 * src1 are used.
3208 *
3209 * If multiplying by an immediate value that fits in 16-bits, do a
3210 * single MUL instruction with that value in the proper location.
3211 */
3212 if (devinfo->gen < 7) {
3213 fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
3214 inst->dst.type);
3215 ibld.MOV(imm, inst->src[1]);
3216 ibld.MUL(inst->dst, imm, inst->src[0]);
3217 } else {
3218 ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3219 }
3220 } else {
3221 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3222 * do 32-bit integer multiplication in one instruction, but instead
3223 * must do a sequence (which actually calculates a 64-bit result):
3224 *
3225 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3226 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3227 * mov(8) g2<1>D acc0<8,8,1>D
3228 *
3229 * But on Gen > 6, the ability to use the second accumulator register
3230 * (acc1) for non-float data types was removed, preventing a simple
3231 * implementation in SIMD16. A 16-channel result can be calculated by
3232 * executing the three instructions twice in SIMD8, once with quarter
3233 * control of 1Q for the first eight channels and again with 2Q for
3234 * the second eight channels.
3235 *
3236 * Which accumulator register is implicitly accessed (by AccWrEnable
3237 * for instance) is determined by the quarter control. Unfortunately
3238 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3239 * implicit accumulator access by an instruction with 2Q will access
3240 * acc1 regardless of whether the data type is usable in acc1.
3241 *
3242 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3243 * integer data types.
3244 *
3245 * Since we only want the low 32-bits of the result, we can do two
3246 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3247 * adjust the high result and add them (like the mach is doing):
3248 *
3249 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3250 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3251 * shl(8) g9<1>D g8<8,8,1>D 16D
3252 * add(8) g2<1>D g7<8,8,1>D g9<8,8,1>D
3253 *
3254 * We avoid the shl instruction by realizing that we only want to add
3255 * the low 16-bits of the "high" result to the high 16-bits of the
3256 * "low" result and using proper regioning on the add:
3257 *
3258 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3259 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3260 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3261 *
3262 * Since it does not use the (single) accumulator register, we can
3263 * schedule multi-component multiplications much better.
3264 */
3265
3266 fs_reg orig_dst = inst->dst;
3267 if (orig_dst.is_null() || orig_dst.file == MRF) {
3268 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
3269 inst->dst.type);
3270 }
3271 fs_reg low = inst->dst;
3272 fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
3273 inst->dst.type);
3274
3275 if (devinfo->gen >= 7) {
3276 fs_reg src1_0_w = inst->src[1];
3277 fs_reg src1_1_w = inst->src[1];
3278
3279 if (inst->src[1].file == IMM) {
3280 src1_0_w.ud &= 0xffff;
3281 src1_1_w.ud >>= 16;
3282 } else {
3283 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3284 if (src1_0_w.stride != 0) {
3285 assert(src1_0_w.stride == 1);
3286 src1_0_w.stride = 2;
3287 }
3288
3289 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3290 if (src1_1_w.stride != 0) {
3291 assert(src1_1_w.stride == 1);
3292 src1_1_w.stride = 2;
3293 }
3294 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3295 }
3296 ibld.MUL(low, inst->src[0], src1_0_w);
3297 ibld.MUL(high, inst->src[0], src1_1_w);
3298 } else {
3299 fs_reg src0_0_w = inst->src[0];
3300 fs_reg src0_1_w = inst->src[0];
3301
3302 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3303 if (src0_0_w.stride != 0) {
3304 assert(src0_0_w.stride == 1);
3305 src0_0_w.stride = 2;
3306 }
3307
3308 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3309 if (src0_1_w.stride != 0) {
3310 assert(src0_1_w.stride == 1);
3311 src0_1_w.stride = 2;
3312 }
3313 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3314
3315 ibld.MUL(low, src0_0_w, inst->src[1]);
3316 ibld.MUL(high, src0_1_w, inst->src[1]);
3317 }
3318
3319 fs_reg dst = inst->dst;
3320 dst.type = BRW_REGISTER_TYPE_UW;
3321 dst.subreg_offset = 2;
3322 dst.stride = 2;
3323
3324 high.type = BRW_REGISTER_TYPE_UW;
3325 high.stride = 2;
3326
3327 low.type = BRW_REGISTER_TYPE_UW;
3328 low.subreg_offset = 2;
3329 low.stride = 2;
3330
3331 ibld.ADD(dst, low, high);
3332
3333 if (inst->conditional_mod || orig_dst.file == MRF) {
3334 set_condmod(inst->conditional_mod,
3335 ibld.MOV(orig_dst, inst->dst));
3336 }
3337 }
3338
3339 } else if (inst->opcode == SHADER_OPCODE_MULH) {
3340 /* Should have been lowered to 8-wide. */
3341 assert(inst->exec_size <= 8);
3342 const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
3343 inst->dst.type);
3344 fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3345 fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3346
3347 if (devinfo->gen >= 8) {
3348 /* Until Gen8, integer multiplies read 32-bits from one source
3349 * and 16-bits from the other, relying on the MACH instruction
3350 * to generate the high bits of the result.
3351 *
3352 * On Gen8, the multiply instruction does a full 32x32-bit
3353 * multiply, but in order to do a 64-bit multiply we can simulate
3354 * the previous behavior and then use a MACH instruction.
3355 *
3356 * FINISHME: Don't use source modifiers on src1.
3357 */
3358 assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
3359 mul->src[1].type == BRW_REGISTER_TYPE_UD);
3360 mul->src[1].type = (type_is_signed(mul->src[1].type) ?
3361 BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
3362 mul->src[1].stride *= 2;
3363
3364 } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
3365 inst->force_sechalf) {
3366 /* Among other things the quarter control bits influence which
3367 * accumulator register is used by the hardware for instructions
3368 * that access the accumulator implicitly (e.g. MACH). A
3369 * second-half instruction would normally map to acc1, which
3370 * doesn't exist on Gen7 and up (the hardware does emulate it for
3371 * floating-point instructions *only* by taking advantage of the
3372 * extra precision of acc0 not normally used for floating point
3373 * arithmetic).
3374 *
3375 * HSW and up are careful enough not to try to access an
3376 * accumulator register that doesn't exist, but on earlier Gen7
3377 * hardware we need to make sure that the quarter control bits are
3378 * zero to avoid non-deterministic behaviour and emit an extra MOV
3379 * to get the result masked correctly according to the current
3380 * channel enables.
3381 */
3382 mach->force_sechalf = false;
3383 mach->force_writemask_all = true;
3384 mach->dst = ibld.vgrf(inst->dst.type);
3385 ibld.MOV(inst->dst, mach->dst);
3386 }
3387 } else {
3388 continue;
3389 }
3390
3391 inst->remove(block);
3392 progress = true;
3393 }
3394
3395 if (progress)
3396 invalidate_live_intervals();
3397
3398 return progress;
3399 }
3400
3401 static void
3402 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
3403 fs_reg *dst, fs_reg color, unsigned components)
3404 {
3405 if (key->clamp_fragment_color) {
3406 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
3407 assert(color.type == BRW_REGISTER_TYPE_F);
3408
3409 for (unsigned i = 0; i < components; i++)
3410 set_saturate(true,
3411 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
3412
3413 color = tmp;
3414 }
3415
3416 for (unsigned i = 0; i < components; i++)
3417 dst[i] = offset(color, bld, i);
3418 }
3419
3420 static void
3421 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
3422 const brw_wm_prog_data *prog_data,
3423 const brw_wm_prog_key *key,
3424 const fs_visitor::thread_payload &payload)
3425 {
3426 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
3427 const brw_device_info *devinfo = bld.shader->devinfo;
3428 const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
3429 const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
3430 const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
3431 const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
3432 const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
3433 const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
3434 fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
3435 const unsigned components =
3436 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
3437
3438 /* We can potentially have a message length of up to 15, so we have to set
3439 * base_mrf to either 0 or 1 in order to fit in m0..m15.
3440 */
3441 fs_reg sources[15];
3442 int header_size = 2, payload_header_size;
3443 unsigned length = 0;
3444
3445 /* From the Sandy Bridge PRM, volume 4, page 198:
3446 *
3447 * "Dispatched Pixel Enables. One bit per pixel indicating
3448 * which pixels were originally enabled when the thread was
3449 * dispatched. This field is only required for the end-of-
3450 * thread message and on all dual-source messages."
3451 */
3452 if (devinfo->gen >= 6 &&
3453 (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3454 color1.file == BAD_FILE &&
3455 key->nr_color_regions == 1) {
3456 header_size = 0;
3457 }
3458
3459 if (header_size != 0) {
3460 assert(header_size == 2);
3461 /* Allocate 2 registers for a header */
3462 length += 2;
3463 }
3464
3465 if (payload.aa_dest_stencil_reg) {
3466 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
3467 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
3468 .MOV(sources[length],
3469 fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
3470 length++;
3471 }
3472
3473 if (prog_data->uses_omask) {
3474 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
3475 BRW_REGISTER_TYPE_UD);
3476
3477 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
3478 * relevant. Since it's unsigned single words, one VGRF is always
3479 * 16-wide, but only the lower or higher 8 channels will be used by the
3480 * hardware when doing a SIMD8 write depending on whether we have
3481 * selected the subspans for the first or second half respectively.
3482 */
3483 assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
3484 sample_mask.type = BRW_REGISTER_TYPE_UW;
3485 sample_mask.stride *= 2;
3486
3487 bld.exec_all().annotate("FB write oMask")
3488 .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
3489 inst->force_sechalf),
3490 sample_mask);
3491 length++;
3492 }
3493
3494 payload_header_size = length;
3495
3496 if (src0_alpha.file != BAD_FILE) {
3497 /* FIXME: This is being passed at the wrong location in the payload and
3498 * doesn't work when gl_SampleMask and MRTs are used simultaneously.
3499 * It's supposed to be immediately before oMask but there seems to be no
3500 * reasonable way to pass them in the correct order because LOAD_PAYLOAD
3501 * requires header sources to form a contiguous segment at the beginning
3502 * of the message and src0_alpha has per-channel semantics.
3503 */
3504 setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
3505 length++;
3506 }
3507
3508 setup_color_payload(bld, key, &sources[length], color0, components);
3509 length += 4;
3510
3511 if (color1.file != BAD_FILE) {
3512 setup_color_payload(bld, key, &sources[length], color1, components);
3513 length += 4;
3514 }
3515
3516 if (src_depth.file != BAD_FILE) {
3517 sources[length] = src_depth;
3518 length++;
3519 }
3520
3521 if (dst_depth.file != BAD_FILE) {
3522 sources[length] = dst_depth;
3523 length++;
3524 }
3525
3526 if (src_stencil.file != BAD_FILE) {
3527 assert(devinfo->gen >= 9);
3528 assert(bld.dispatch_width() != 16);
3529
3530 /* XXX: src_stencil is only available on gen9+. dst_depth is never
3531 * available on gen9+. As such it's impossible to have both enabled at the
3532 * same time and therefore length cannot overrun the array.
3533 */
3534 assert(length < 15);
3535
3536 sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3537 bld.exec_all().annotate("FB write OS")
3538 .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length],
3539 retype(src_stencil, BRW_REGISTER_TYPE_UB));
3540 length++;
3541 }
3542
3543 fs_inst *load;
3544 if (devinfo->gen >= 7) {
3545 /* Send from the GRF */
3546 fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
3547 load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
3548 payload.nr = bld.shader->alloc.allocate(load->regs_written);
3549 load->dst = payload;
3550
3551 inst->src[0] = payload;
3552 inst->resize_sources(1);
3553 inst->base_mrf = -1;
3554 } else {
3555 /* Send from the MRF */
3556 load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3557 sources, length, payload_header_size);
3558
3559 /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
3560 * will do this for us if we just give it a COMPR4 destination.
3561 */
3562 if (devinfo->gen < 6 && bld.dispatch_width() == 16)
3563 load->dst.nr |= BRW_MRF_COMPR4;
3564
3565 inst->resize_sources(0);
3566 inst->base_mrf = 1;
3567 }
3568
3569 inst->opcode = FS_OPCODE_FB_WRITE;
3570 inst->mlen = load->regs_written;
3571 inst->header_size = header_size;
3572 }
3573
3574 static void
3575 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
3576 const fs_reg &coordinate,
3577 const fs_reg &shadow_c,
3578 const fs_reg &lod, const fs_reg &lod2,
3579 const fs_reg &sampler,
3580 unsigned coord_components,
3581 unsigned grad_components)
3582 {
3583 const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
3584 op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
3585 fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
3586 fs_reg msg_end = msg_begin;
3587
3588 /* g0 header. */
3589 msg_end = offset(msg_end, bld.group(8, 0), 1);
3590
3591 for (unsigned i = 0; i < coord_components; i++)
3592 bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
3593 offset(coordinate, bld, i));
3594
3595 msg_end = offset(msg_end, bld, coord_components);
3596
3597 /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
3598 * require all three components to be present and zero if they are unused.
3599 */
3600 if (coord_components > 0 &&
3601 (has_lod || shadow_c.file != BAD_FILE ||
3602 (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
3603 for (unsigned i = coord_components; i < 3; i++)
3604 bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
3605
3606 msg_end = offset(msg_end, bld, 3 - coord_components);
3607 }
3608
3609 if (op == SHADER_OPCODE_TXD) {
3610 /* TXD unsupported in SIMD16 mode. */
3611 assert(bld.dispatch_width() == 8);
3612
3613 /* the slots for u and v are always present, but r is optional */
3614 if (coord_components < 2)
3615 msg_end = offset(msg_end, bld, 2 - coord_components);
3616
3617 /* P = u, v, r
3618 * dPdx = dudx, dvdx, drdx
3619 * dPdy = dudy, dvdy, drdy
3620 *
3621 * 1-arg: Does not exist.
3622 *
3623 * 2-arg: dudx dvdx dudy dvdy
3624 * dPdx.x dPdx.y dPdy.x dPdy.y
3625 * m4 m5 m6 m7
3626 *
3627 * 3-arg: dudx dvdx drdx dudy dvdy drdy
3628 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
3629 * m5 m6 m7 m8 m9 m10
3630 */
3631 for (unsigned i = 0; i < grad_components; i++)
3632 bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
3633
3634 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3635
3636 for (unsigned i = 0; i < grad_components; i++)
3637 bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
3638
3639 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3640 }
3641
3642 if (has_lod) {
3643       /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *without*
3644        * a shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
3645 */
3646 assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
3647 bld.dispatch_width() == 16);
3648
3649 const brw_reg_type type =
3650 (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
3651 BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
3652 bld.MOV(retype(msg_end, type), lod);
3653 msg_end = offset(msg_end, bld, 1);
3654 }
3655
3656 if (shadow_c.file != BAD_FILE) {
3657 if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
3658 /* There's no plain shadow compare message, so we use shadow
3659 * compare with a bias of 0.0.
3660 */
3661 bld.MOV(msg_end, brw_imm_f(0.0f));
3662 msg_end = offset(msg_end, bld, 1);
3663 }
3664
3665 bld.MOV(msg_end, shadow_c);
3666 msg_end = offset(msg_end, bld, 1);
3667 }
3668
3669 inst->opcode = op;
3670 inst->src[0] = reg_undef;
3671 inst->src[1] = sampler;
3672 inst->resize_sources(2);
3673 inst->base_mrf = msg_begin.nr;
3674 inst->mlen = msg_end.nr - msg_begin.nr;
3675 inst->header_size = 1;
3676 }
3677
3678 static void
3679 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
3680 fs_reg coordinate,
3681 const fs_reg &shadow_c,
3682 fs_reg lod, fs_reg lod2,
3683 const fs_reg &sample_index,
3684 const fs_reg &sampler,
3685 const fs_reg &offset_value,
3686 unsigned coord_components,
3687 unsigned grad_components)
3688 {
3689 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
3690 fs_reg msg_coords = message;
3691 unsigned header_size = 0;
3692
3693 if (offset_value.file != BAD_FILE) {
3694 /* The offsets set up by the visitor are in the m1 header, so we can't
3695 * go headerless.
3696 */
3697 header_size = 1;
3698 message.nr--;
3699 }
3700
3701 for (unsigned i = 0; i < coord_components; i++) {
3702 bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
3703 coordinate = offset(coordinate, bld, 1);
3704 }
3705 fs_reg msg_end = offset(msg_coords, bld, coord_components);
3706 fs_reg msg_lod = offset(msg_coords, bld, 4);
3707
3708 if (shadow_c.file != BAD_FILE) {
3709 fs_reg msg_shadow = msg_lod;
3710 bld.MOV(msg_shadow, shadow_c);
3711 msg_lod = offset(msg_shadow, bld, 1);
3712 msg_end = msg_lod;
3713 }
3714
3715 switch (op) {
3716 case SHADER_OPCODE_TXL:
3717 case FS_OPCODE_TXB:
3718 bld.MOV(msg_lod, lod);
3719 msg_end = offset(msg_lod, bld, 1);
3720 break;
3721 case SHADER_OPCODE_TXD:
3722 /**
3723 * P = u, v, r
3724 * dPdx = dudx, dvdx, drdx
3725 * dPdy = dudy, dvdy, drdy
3726 *
3727 * Load up these values:
3728 * - dudx dudy dvdx dvdy drdx drdy
3729 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
3730 */
3731 msg_end = msg_lod;
3732 for (unsigned i = 0; i < grad_components; i++) {
3733 bld.MOV(msg_end, lod);
3734 lod = offset(lod, bld, 1);
3735 msg_end = offset(msg_end, bld, 1);
3736
3737 bld.MOV(msg_end, lod2);
3738 lod2 = offset(lod2, bld, 1);
3739 msg_end = offset(msg_end, bld, 1);
3740 }
3741 break;
3742 case SHADER_OPCODE_TXS:
3743 msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
3744 bld.MOV(msg_lod, lod);
3745 msg_end = offset(msg_lod, bld, 1);
3746 break;
3747 case SHADER_OPCODE_TXF:
3748 msg_lod = offset(msg_coords, bld, 3);
3749 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
3750 msg_end = offset(msg_lod, bld, 1);
3751 break;
3752 case SHADER_OPCODE_TXF_CMS:
3753 msg_lod = offset(msg_coords, bld, 3);
3754 /* lod */
3755 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
3756 /* sample index */
3757 bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
3758 msg_end = offset(msg_lod, bld, 2);
3759 break;
3760 default:
3761 break;
3762 }
3763
3764 inst->opcode = op;
3765 inst->src[0] = reg_undef;
3766 inst->src[1] = sampler;
3767 inst->resize_sources(2);
3768 inst->base_mrf = message.nr;
3769 inst->mlen = msg_end.nr - message.nr;
3770 inst->header_size = header_size;
3771
3772 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
3773 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
3774 }
3775
3776 static bool
3777 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
3778 {
3779 if (devinfo->gen < 8 && !devinfo->is_haswell)
3780 return false;
3781
3782 return sampler.file != IMM || sampler.ud >= 16;
3783 }
3784
3785 static void
3786 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
3787 fs_reg coordinate,
3788 const fs_reg &shadow_c,
3789 fs_reg lod, fs_reg lod2,
3790 const fs_reg &sample_index,
3791 const fs_reg &mcs, const fs_reg &sampler,
3792 fs_reg offset_value,
3793 unsigned coord_components,
3794 unsigned grad_components)
3795 {
3796 const brw_device_info *devinfo = bld.shader->devinfo;
3797 int reg_width = bld.dispatch_width() / 8;
3798 unsigned header_size = 0, length = 0;
3799 fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
3800 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
3801 sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
3802
3803 if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
3804 offset_value.file != BAD_FILE ||
3805 is_high_sampler(devinfo, sampler)) {
3806 /* For general texture offsets (no txf workaround), we need a header to
3807 * put them in. Note that we're only reserving space for it in the
3808 * message payload as it will be initialized implicitly by the
3809 * generator.
3810 *
3811 * TG4 needs to place its channel select in the header, for interaction
3812        * with ARB_texture_swizzle. The sampler index is only 4 bits, so for
3813 * larger sampler numbers we need to offset the Sampler State Pointer in
3814 * the header.
3815 */
3816 header_size = 1;
3817 sources[0] = fs_reg();
3818 length++;
3819 }
3820
3821 if (shadow_c.file != BAD_FILE) {
3822 bld.MOV(sources[length], shadow_c);
3823 length++;
3824 }
3825
3826 bool coordinate_done = false;
3827
3828 /* The sampler can only meaningfully compute LOD for fragment shader
3829 * messages. For all other stages, we change the opcode to TXL and
3830 * hardcode the LOD to 0.
3831 */
3832 if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
3833 op == SHADER_OPCODE_TEX) {
3834 op = SHADER_OPCODE_TXL;
3835 lod = brw_imm_f(0.0f);
3836 }
3837
3838 /* Set up the LOD info */
3839 switch (op) {
3840 case FS_OPCODE_TXB:
3841 case SHADER_OPCODE_TXL:
3842 bld.MOV(sources[length], lod);
3843 length++;
3844 break;
3845 case SHADER_OPCODE_TXD:
3846 /* TXD should have been lowered in SIMD16 mode. */
3847 assert(bld.dispatch_width() == 8);
3848
3849 /* Load dPdx and the coordinate together:
3850 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
3851 */
3852 for (unsigned i = 0; i < coord_components; i++) {
3853 bld.MOV(sources[length], coordinate);
3854 coordinate = offset(coordinate, bld, 1);
3855 length++;
3856
3857 /* For cube map array, the coordinate is (u,v,r,ai) but there are
3858 * only derivatives for (u, v, r).
3859 */
3860 if (i < grad_components) {
3861 bld.MOV(sources[length], lod);
3862 lod = offset(lod, bld, 1);
3863 length++;
3864
3865 bld.MOV(sources[length], lod2);
3866 lod2 = offset(lod2, bld, 1);
3867 length++;
3868 }
3869 }
3870
3871 coordinate_done = true;
3872 break;
3873 case SHADER_OPCODE_TXS:
3874 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
3875 length++;
3876 break;
3877 case SHADER_OPCODE_TXF:
3878 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
3879 * On Gen9 they are u, v, lod, r
3880 */
3881 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3882 coordinate = offset(coordinate, bld, 1);
3883 length++;
3884
3885 if (devinfo->gen >= 9) {
3886 if (coord_components >= 2) {
3887 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3888 coordinate = offset(coordinate, bld, 1);
3889 }
3890 length++;
3891 }
3892
3893 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
3894 length++;
3895
3896 for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
3897 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3898 coordinate = offset(coordinate, bld, 1);
3899 length++;
3900 }
3901
3902 coordinate_done = true;
3903 break;
3904 case SHADER_OPCODE_TXF_CMS:
3905 case SHADER_OPCODE_TXF_CMS_W:
3906 case SHADER_OPCODE_TXF_UMS:
3907 case SHADER_OPCODE_TXF_MCS:
3908 if (op == SHADER_OPCODE_TXF_UMS ||
3909 op == SHADER_OPCODE_TXF_CMS ||
3910 op == SHADER_OPCODE_TXF_CMS_W) {
3911 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
3912 length++;
3913 }
3914
3915 if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
3916 /* Data from the multisample control surface. */
3917 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
3918 length++;
3919
3920 /* On Gen9+ we'll use ld2dms_w instead which has two registers for
3921 * the MCS data.
3922 */
3923 if (op == SHADER_OPCODE_TXF_CMS_W) {
3924 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
3925 mcs.file == IMM ?
3926 mcs :
3927 offset(mcs, bld, 1));
3928 length++;
3929 }
3930 }
3931
3932 /* There is no offsetting for this message; just copy in the integer
3933 * texture coordinates.
3934 */
3935 for (unsigned i = 0; i < coord_components; i++) {
3936 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3937 coordinate = offset(coordinate, bld, 1);
3938 length++;
3939 }
3940
3941 coordinate_done = true;
3942 break;
3943 case SHADER_OPCODE_TG4_OFFSET:
3944 /* gather4_po_c should have been lowered in SIMD16 mode. */
3945 assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
3946
3947 /* More crazy intermixing */
3948 for (unsigned i = 0; i < 2; i++) { /* u, v */
3949 bld.MOV(sources[length], coordinate);
3950 coordinate = offset(coordinate, bld, 1);
3951 length++;
3952 }
3953
3954 for (unsigned i = 0; i < 2; i++) { /* offu, offv */
3955 bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
3956 offset_value = offset(offset_value, bld, 1);
3957 length++;
3958 }
3959
3960 if (coord_components == 3) { /* r if present */
3961 bld.MOV(sources[length], coordinate);
3962 coordinate = offset(coordinate, bld, 1);
3963 length++;
3964 }
3965
3966 coordinate_done = true;
3967 break;
3968 default:
3969 break;
3970 }
3971
3972 /* Set up the coordinate (except for cases where it was done above) */
3973 if (!coordinate_done) {
3974 for (unsigned i = 0; i < coord_components; i++) {
3975 bld.MOV(sources[length], coordinate);
3976 coordinate = offset(coordinate, bld, 1);
3977 length++;
3978 }
3979 }
3980
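        /* Each per-channel payload element counted in "length" occupies
         * reg_width GRFs, but a header element is always a single GRF, so in
         * SIMD16 we subtract one register per header element.  For example
         * (illustrative numbers): a header plus 7 payload elements in SIMD16
         * gives mlen = 8 * 2 - 1 = 15.
         */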
3981 int mlen;
3982 if (reg_width == 2)
3983 mlen = length * reg_width - header_size;
3984 else
3985 mlen = length * reg_width;
3986
3987 const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
3988 BRW_REGISTER_TYPE_F);
3989 bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
3990
3991 /* Generate the SEND. */
3992 inst->opcode = op;
3993 inst->src[0] = src_payload;
3994 inst->src[1] = sampler;
3995 inst->resize_sources(2);
3996 inst->base_mrf = -1;
3997 inst->mlen = mlen;
3998 inst->header_size = header_size;
3999
4000 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
4001 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
4002 }
4003
4004 static void
4005 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
4006 {
4007 const brw_device_info *devinfo = bld.shader->devinfo;
4008 const fs_reg &coordinate = inst->src[0];
4009 const fs_reg &shadow_c = inst->src[1];
4010 const fs_reg &lod = inst->src[2];
4011 const fs_reg &lod2 = inst->src[3];
4012 const fs_reg &sample_index = inst->src[4];
4013 const fs_reg &mcs = inst->src[5];
4014 const fs_reg &sampler = inst->src[6];
4015 const fs_reg &offset_value = inst->src[7];
4016 assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
4017 const unsigned coord_components = inst->src[8].ud;
4018 const unsigned grad_components = inst->src[9].ud;
4019
4020 if (devinfo->gen >= 7) {
4021 lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
4022 shadow_c, lod, lod2, sample_index,
4023 mcs, sampler, offset_value,
4024 coord_components, grad_components);
4025 } else if (devinfo->gen >= 5) {
4026 lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
4027 shadow_c, lod, lod2, sample_index,
4028 sampler, offset_value,
4029 coord_components, grad_components);
4030 } else {
4031 lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
4032 shadow_c, lod, lod2, sampler,
4033 coord_components, grad_components);
4034 }
4035 }
4036
4037 /**
4038 * Initialize the header present in some typed and untyped surface
4039 * messages.
4040 */
4041 static fs_reg
4042 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
4043 {
4044 fs_builder ubld = bld.exec_all().group(8, 0);
4045 const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4046 ubld.MOV(dst, brw_imm_d(0));
4047 ubld.MOV(component(dst, 7), sample_mask);
4048 return dst;
4049 }
4050
4051 static void
4052 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
4053 const fs_reg &sample_mask)
4054 {
4055 /* Get the logical send arguments. */
4056 const fs_reg &addr = inst->src[0];
4057 const fs_reg &src = inst->src[1];
4058 const fs_reg &surface = inst->src[2];
4059 const UNUSED fs_reg &dims = inst->src[3];
4060 const fs_reg &arg = inst->src[4];
4061
4062 /* Calculate the total number of components of the payload. */
4063 const unsigned addr_sz = inst->components_read(0);
4064 const unsigned src_sz = inst->components_read(1);
4065 const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
4066 const unsigned sz = header_sz + addr_sz + src_sz;
4067
4068 /* Allocate space for the payload. */
4069 fs_reg *const components = new fs_reg[sz];
4070 const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
4071 unsigned n = 0;
4072
4073 /* Construct the payload. */
4074 if (header_sz)
4075 components[n++] = emit_surface_header(bld, sample_mask);
4076
4077 for (unsigned i = 0; i < addr_sz; i++)
4078 components[n++] = offset(addr, bld, i);
4079
4080 for (unsigned i = 0; i < src_sz; i++)
4081 components[n++] = offset(src, bld, i);
4082
4083 bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
4084
4085 /* Update the original instruction. */
4086 inst->opcode = op;
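       /* Each address/data component occupies exec_size / 8 registers, while
        * the header (if present) is a single SIMD8 register.  For example
        * (illustrative numbers): a SIMD16 write with one address component,
        * four data components and a header takes 1 + 5 * 2 = 11 registers.
        */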
4087 inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
4088 inst->header_size = header_sz;
4089
4090 inst->src[0] = payload;
4091 inst->src[1] = surface;
4092 inst->src[2] = arg;
4093 inst->resize_sources(3);
4094
4095 delete[] components;
4096 }
4097
4098 bool
4099 fs_visitor::lower_logical_sends()
4100 {
4101 bool progress = false;
4102
4103 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4104 const fs_builder ibld(this, block, inst);
4105
4106 switch (inst->opcode) {
4107 case FS_OPCODE_FB_WRITE_LOGICAL:
4108 assert(stage == MESA_SHADER_FRAGMENT);
4109 lower_fb_write_logical_send(ibld, inst,
4110 (const brw_wm_prog_data *)prog_data,
4111 (const brw_wm_prog_key *)key,
4112 payload);
4113 break;
4114
4115 case SHADER_OPCODE_TEX_LOGICAL:
4116 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
4117 break;
4118
4119 case SHADER_OPCODE_TXD_LOGICAL:
4120 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
4121 break;
4122
4123 case SHADER_OPCODE_TXF_LOGICAL:
4124 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
4125 break;
4126
4127 case SHADER_OPCODE_TXL_LOGICAL:
4128 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
4129 break;
4130
4131 case SHADER_OPCODE_TXS_LOGICAL:
4132 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
4133 break;
4134
4135 case FS_OPCODE_TXB_LOGICAL:
4136 lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
4137 break;
4138
4139 case SHADER_OPCODE_TXF_CMS_LOGICAL:
4140 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
4141 break;
4142
4143 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4144 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
4145 break;
4146
4147 case SHADER_OPCODE_TXF_UMS_LOGICAL:
4148 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
4149 break;
4150
4151 case SHADER_OPCODE_TXF_MCS_LOGICAL:
4152 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
4153 break;
4154
4155 case SHADER_OPCODE_LOD_LOGICAL:
4156 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
4157 break;
4158
4159 case SHADER_OPCODE_TG4_LOGICAL:
4160 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
4161 break;
4162
4163 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4164 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
4165 break;
4166
4167 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4168 lower_surface_logical_send(ibld, inst,
4169 SHADER_OPCODE_UNTYPED_SURFACE_READ,
4170 fs_reg());
4171 break;
4172
4173 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4174 lower_surface_logical_send(ibld, inst,
4175 SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
4176 ibld.sample_mask_reg());
4177 break;
4178
4179 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4180 lower_surface_logical_send(ibld, inst,
4181 SHADER_OPCODE_UNTYPED_ATOMIC,
4182 ibld.sample_mask_reg());
4183 break;
4184
4185 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4186 lower_surface_logical_send(ibld, inst,
4187 SHADER_OPCODE_TYPED_SURFACE_READ,
4188 brw_imm_d(0xffff));
4189 break;
4190
4191 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4192 lower_surface_logical_send(ibld, inst,
4193 SHADER_OPCODE_TYPED_SURFACE_WRITE,
4194 ibld.sample_mask_reg());
4195 break;
4196
4197 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4198 lower_surface_logical_send(ibld, inst,
4199 SHADER_OPCODE_TYPED_ATOMIC,
4200 ibld.sample_mask_reg());
4201 break;
4202
4203 default:
4204 continue;
4205 }
4206
4207 progress = true;
4208 }
4209
4210 if (progress)
4211 invalidate_live_intervals();
4212
4213 return progress;
4214 }
4215
4216 /**
4217 * Get the closest native SIMD width supported by the hardware for instruction
4218 * \p inst. The instruction will be left untouched by
4219 * fs_visitor::lower_simd_width() if the returned value is equal to the
4220 * original execution size.
4221 */
4222 static unsigned
4223 get_lowered_simd_width(const struct brw_device_info *devinfo,
4224 const fs_inst *inst)
4225 {
4226 switch (inst->opcode) {
4227 case BRW_OPCODE_MOV:
4228 case BRW_OPCODE_SEL:
4229 case BRW_OPCODE_NOT:
4230 case BRW_OPCODE_AND:
4231 case BRW_OPCODE_OR:
4232 case BRW_OPCODE_XOR:
4233 case BRW_OPCODE_SHR:
4234 case BRW_OPCODE_SHL:
4235 case BRW_OPCODE_ASR:
4236 case BRW_OPCODE_CMP:
4237 case BRW_OPCODE_CMPN:
4238 case BRW_OPCODE_CSEL:
4239 case BRW_OPCODE_F32TO16:
4240 case BRW_OPCODE_F16TO32:
4241 case BRW_OPCODE_BFREV:
4242 case BRW_OPCODE_BFE:
4243 case BRW_OPCODE_BFI1:
4244 case BRW_OPCODE_BFI2:
4245 case BRW_OPCODE_ADD:
4246 case BRW_OPCODE_MUL:
4247 case BRW_OPCODE_AVG:
4248 case BRW_OPCODE_FRC:
4249 case BRW_OPCODE_RNDU:
4250 case BRW_OPCODE_RNDD:
4251 case BRW_OPCODE_RNDE:
4252 case BRW_OPCODE_RNDZ:
4253 case BRW_OPCODE_LZD:
4254 case BRW_OPCODE_FBH:
4255 case BRW_OPCODE_FBL:
4256 case BRW_OPCODE_CBIT:
4257 case BRW_OPCODE_SAD2:
4258 case BRW_OPCODE_MAD:
4259 case BRW_OPCODE_LRP:
4260 case SHADER_OPCODE_RCP:
4261 case SHADER_OPCODE_RSQ:
4262 case SHADER_OPCODE_SQRT:
4263 case SHADER_OPCODE_EXP2:
4264 case SHADER_OPCODE_LOG2:
4265 case SHADER_OPCODE_POW:
4266 case SHADER_OPCODE_INT_QUOTIENT:
4267 case SHADER_OPCODE_INT_REMAINDER:
4268 case SHADER_OPCODE_SIN:
4269 case SHADER_OPCODE_COS: {
4270 /* According to the PRMs:
4271 * "A. In Direct Addressing mode, a source cannot span more than 2
4272 * adjacent GRF registers.
4273 * B. A destination cannot span more than 2 adjacent GRF registers."
4274 *
4275 * Look for the source or destination with the largest register region
4276       * which is the one that is going to limit the overall execution size of
4277 * the instruction due to this rule.
4278 */
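           /* For example (illustrative numbers): a SIMD16 instruction whose
            * widest operand region spans 4 GRFs is limited to
            * 16 / DIV_ROUND_UP(4, 2) = 8 channels, i.e. it will be split into
            * two SIMD8 halves by lower_simd_width().
            */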
4279 unsigned reg_count = inst->regs_written;
4280
4281 for (unsigned i = 0; i < inst->sources; i++)
4282 reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
4283
4284 /* Calculate the maximum execution size of the instruction based on the
4285 * factor by which it goes over the hardware limit of 2 GRFs.
4286 */
4287 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4288 }
4289 case SHADER_OPCODE_MULH:
4290 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4291 * is 8-wide on Gen7+.
4292 */
4293 return (devinfo->gen >= 7 ? 8 : inst->exec_size);
4294
4295 case FS_OPCODE_FB_WRITE_LOGICAL:
4296 /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
4297 * here.
4298 */
4299 assert(devinfo->gen != 6 ||
4300 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4301 inst->exec_size == 8);
4302 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4303 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4304 8 : inst->exec_size);
4305
4306 case SHADER_OPCODE_TXD_LOGICAL:
4307 /* TXD is unsupported in SIMD16 mode. */
4308 return 8;
4309
4310 case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
4311 /* gather4_po_c is unsupported in SIMD16 mode. */
4312 const fs_reg &shadow_c = inst->src[1];
4313 return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
4314 }
4315 case SHADER_OPCODE_TXL_LOGICAL:
4316 case FS_OPCODE_TXB_LOGICAL: {
4317 /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
4318 * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
4319 * mode because the message exceeds the maximum length of 11.
4320 */
4321 const fs_reg &shadow_c = inst->src[1];
4322 if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
4323 return 16;
4324 else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
4325 return 8;
4326 else
4327 return inst->exec_size;
4328 }
4329 case SHADER_OPCODE_TXF_LOGICAL:
4330 case SHADER_OPCODE_TXS_LOGICAL:
4331 /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4332 * messages. Use SIMD16 instead.
4333 */
4334 if (devinfo->gen == 4)
4335 return 16;
4336 else
4337 return inst->exec_size;
4338
4339 case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
4340 /* This opcode can take up to 6 arguments which means that in some
4341 * circumstances it can end up with a message that is too long in SIMD16
4342 * mode.
4343 */
4344 const unsigned coord_components = inst->src[8].ud;
4345 /* First three arguments are the sample index and the two arguments for
4346 * the MCS data.
4347 */
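          /* For example: a 3D texel fetch (coord_components == 3) would need
           * (3 + 3) * 2 = 12 registers in SIMD16, exceeding the limit of 11,
           * so it is lowered to SIMD8, while a 2D fetch ((2 + 3) * 2 = 10)
           * keeps the original execution size.
           */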
4348 if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
4349 return 8;
4350 else
4351 return inst->exec_size;
4352 }
4353
4354 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4355 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4356 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4357 return 8;
4358
4359 default:
4360 return inst->exec_size;
4361 }
4362 }
4363
4364 /**
4365 * The \p rows array of registers represents a \p num_rows by \p num_columns
4366  * matrix in row-major order; write it in column-major order into the register
4367  * passed as the destination. \p stride gives the separation between matrix
4368 * elements in the input in fs_builder::dispatch_width() units.
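 *
 * For example (illustrative): with num_rows = 2 ({r0, r1}), num_columns = 3
 * and stride s, the destination receives the components in the order
 * r0, r1, r0+s, r1+s, r0+2s, r1+2s, i.e. one full column at a time.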
4369 */
4370 static void
4371 emit_transpose(const fs_builder &bld,
4372 const fs_reg &dst, const fs_reg *rows,
4373 unsigned num_rows, unsigned num_columns, unsigned stride)
4374 {
4375 fs_reg *const components = new fs_reg[num_rows * num_columns];
4376
4377 for (unsigned i = 0; i < num_columns; ++i) {
4378 for (unsigned j = 0; j < num_rows; ++j)
4379 components[num_rows * i + j] = offset(rows[j], bld, stride * i);
4380 }
4381
4382 bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
4383
4384 delete[] components;
4385 }
4386
4387 bool
4388 fs_visitor::lower_simd_width()
4389 {
4390 bool progress = false;
4391
4392 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4393 const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
4394
4395 if (lower_width != inst->exec_size) {
4396 /* Builder matching the original instruction. We may also need to
4397           * emit an instruction of width larger than the original, so set the
4398           * execution size of the builder to the larger of the two for now to
4399           * be sure that both cases can be handled.
4400 */
4401 const fs_builder ibld = bld.at(block, inst)
4402 .exec_all(inst->force_writemask_all)
4403 .group(MAX2(inst->exec_size, lower_width),
4404 inst->force_sechalf);
4405
4406 /* Split the copies in chunks of the execution width of either the
4407 * original or the lowered instruction, whichever is lower.
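          * For example, lowering a SIMD16 instruction to SIMD8 gives
          * copy_width = 8 and n = 2 chunks.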
4408 */
4409 const unsigned copy_width = MIN2(lower_width, inst->exec_size);
4410 const unsigned n = inst->exec_size / copy_width;
4411 const unsigned dst_size = inst->regs_written * REG_SIZE /
4412 inst->dst.component_size(inst->exec_size);
4413 fs_reg dsts[4];
4414
4415 assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
4416 !inst->writes_accumulator && !inst->mlen);
4417
4418 for (unsigned i = 0; i < n; i++) {
4419 /* Emit a copy of the original instruction with the lowered width.
4420 * If the EOT flag was set throw it away except for the last
4421 * instruction to avoid killing the thread prematurely.
4422 */
4423 fs_inst split_inst = *inst;
4424 split_inst.exec_size = lower_width;
4425 split_inst.eot = inst->eot && i == n - 1;
4426
4427 /* Select the correct channel enables for the i-th group, then
4428 * transform the sources and destination and emit the lowered
4429 * instruction.
4430 */
4431 const fs_builder lbld = ibld.group(lower_width, i);
4432
4433 for (unsigned j = 0; j < inst->sources; j++) {
4434 if (inst->src[j].file != BAD_FILE &&
4435 !is_uniform(inst->src[j])) {
4436 /* Get the i-th copy_width-wide chunk of the source. */
4437 const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
4438 const unsigned src_size = inst->components_read(j);
4439
4440                /* Use a trivial transposition to copy one out of every n
4441 * copy_width-wide components of the register into a
4442 * temporary passed as source to the lowered instruction.
4443 */
4444 split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
4445 emit_transpose(lbld.group(copy_width, 0),
4446 split_inst.src[j], &src, 1, src_size, n);
4447 }
4448 }
4449
4450 if (inst->regs_written) {
4451 /* Allocate enough space to hold the result of the lowered
4452 * instruction and fix up the number of registers written.
4453 */
4454 split_inst.dst = dsts[i] =
4455 lbld.vgrf(inst->dst.type, dst_size);
4456 split_inst.regs_written =
4457 DIV_ROUND_UP(inst->regs_written * lower_width,
4458 inst->exec_size);
4459 }
4460
4461 lbld.emit(split_inst);
4462 }
4463
4464 if (inst->regs_written) {
4465 /* Distance between useful channels in the temporaries, skipping
4466 * garbage if the lowered instruction is wider than the original.
4467 */
4468 const unsigned m = lower_width / copy_width;
4469
4470 /* Interleave the components of the result from the lowered
4471 * instructions. We need to set exec_all() when copying more than
4472 * one half per component, because LOAD_PAYLOAD (in terms of which
4473 * emit_transpose is implemented) can only use the same channel
4474 * enable signals for all of its non-header sources.
4475 */
4476 emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
4477 .group(copy_width, 0),
4478 inst->dst, dsts, n, dst_size, m);
4479 }
4480
4481 inst->remove(block);
4482 progress = true;
4483 }
4484 }
4485
4486 if (progress)
4487 invalidate_live_intervals();
4488
4489 return progress;
4490 }
4491
4492 void
4493 fs_visitor::dump_instructions()
4494 {
4495 dump_instructions(NULL);
4496 }
4497
4498 void
4499 fs_visitor::dump_instructions(const char *name)
4500 {
4501 FILE *file = stderr;
4502 if (name && geteuid() != 0) {
4503 file = fopen(name, "w");
4504 if (!file)
4505 file = stderr;
4506 }
4507
4508 if (cfg) {
4509 calculate_register_pressure();
4510 int ip = 0, max_pressure = 0;
4511 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
4512 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
4513 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
4514 dump_instruction(inst, file);
4515 ip++;
4516 }
4517 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
4518 } else {
4519 int ip = 0;
4520 foreach_in_list(backend_instruction, inst, &instructions) {
4521 fprintf(file, "%4d: ", ip++);
4522 dump_instruction(inst, file);
4523 }
4524 }
4525
4526 if (file != stderr) {
4527 fclose(file);
4528 }
4529 }
4530
4531 void
4532 fs_visitor::dump_instruction(backend_instruction *be_inst)
4533 {
4534 dump_instruction(be_inst, stderr);
4535 }
4536
4537 void
4538 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
4539 {
4540 fs_inst *inst = (fs_inst *)be_inst;
4541
4542 if (inst->predicate) {
4543 fprintf(file, "(%cf0.%d) ",
4544 inst->predicate_inverse ? '-' : '+',
4545 inst->flag_subreg);
4546 }
4547
4548 fprintf(file, "%s", brw_instruction_name(inst->opcode));
4549 if (inst->saturate)
4550 fprintf(file, ".sat");
4551 if (inst->conditional_mod) {
4552 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
4553 if (!inst->predicate &&
4554 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
4555 inst->opcode != BRW_OPCODE_IF &&
4556 inst->opcode != BRW_OPCODE_WHILE))) {
4557 fprintf(file, ".f0.%d", inst->flag_subreg);
4558 }
4559 }
4560 fprintf(file, "(%d) ", inst->exec_size);
4561
4562 if (inst->mlen) {
4563 fprintf(file, "(mlen: %d) ", inst->mlen);
4564 }
4565
4566 switch (inst->dst.file) {
4567 case VGRF:
4568 fprintf(file, "vgrf%d", inst->dst.nr);
4569 if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
4570 inst->dst.subreg_offset)
4571 fprintf(file, "+%d.%d",
4572 inst->dst.reg_offset, inst->dst.subreg_offset);
4573 break;
4574 case FIXED_GRF:
4575 fprintf(file, "g%d", inst->dst.nr);
4576 break;
4577 case MRF:
4578 fprintf(file, "m%d", inst->dst.nr);
4579 break;
4580 case BAD_FILE:
4581 fprintf(file, "(null)");
4582 break;
4583 case UNIFORM:
4584 fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset);
4585 break;
4586 case ATTR:
4587 fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset);
4588 break;
4589 case ARF:
4590 switch (inst->dst.nr) {
4591 case BRW_ARF_NULL:
4592 fprintf(file, "null");
4593 break;
4594 case BRW_ARF_ADDRESS:
4595 fprintf(file, "a0.%d", inst->dst.subnr);
4596 break;
4597 case BRW_ARF_ACCUMULATOR:
4598 fprintf(file, "acc%d", inst->dst.subnr);
4599 break;
4600 case BRW_ARF_FLAG:
4601 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4602 break;
4603 default:
4604 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
4605 break;
4606 }
4607 if (inst->dst.subnr)
4608 fprintf(file, "+%d", inst->dst.subnr);
4609 break;
4610 case IMM:
4611 unreachable("not reached");
4612 }
4613 if (inst->dst.stride != 1)
4614 fprintf(file, "<%u>", inst->dst.stride);
4615 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
4616
4617 for (int i = 0; i < inst->sources; i++) {
4618 if (inst->src[i].negate)
4619 fprintf(file, "-");
4620 if (inst->src[i].abs)
4621 fprintf(file, "|");
4622 switch (inst->src[i].file) {
4623 case VGRF:
4624 fprintf(file, "vgrf%d", inst->src[i].nr);
4625 if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) ||
4626 inst->src[i].subreg_offset)
4627 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4628 inst->src[i].subreg_offset);
4629 break;
4630 case FIXED_GRF:
4631 fprintf(file, "g%d", inst->src[i].nr);
4632 break;
4633 case MRF:
4634 fprintf(file, "***m%d***", inst->src[i].nr);
4635 break;
4636 case ATTR:
4637 fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset);
4638 break;
4639 case UNIFORM:
4640 fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
4641 if (inst->src[i].reladdr) {
4642 fprintf(file, "+reladdr");
4643 } else if (inst->src[i].subreg_offset) {
4644 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4645 inst->src[i].subreg_offset);
4646 }
4647 break;
4648 case BAD_FILE:
4649 fprintf(file, "(null)");
4650 break;
4651 case IMM:
4652 switch (inst->src[i].type) {
4653 case BRW_REGISTER_TYPE_F:
4654 fprintf(file, "%ff", inst->src[i].f);
4655 break;
4656 case BRW_REGISTER_TYPE_W:
4657 case BRW_REGISTER_TYPE_D:
4658 fprintf(file, "%dd", inst->src[i].d);
4659 break;
4660 case BRW_REGISTER_TYPE_UW:
4661 case BRW_REGISTER_TYPE_UD:
4662 fprintf(file, "%uu", inst->src[i].ud);
4663 break;
4664 case BRW_REGISTER_TYPE_VF:
4665 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
4666 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
4667 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
4668 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
4669 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
4670 break;
4671 default:
4672 fprintf(file, "???");
4673 break;
4674 }
4675 break;
4676 case ARF:
4677 switch (inst->src[i].nr) {
4678 case BRW_ARF_NULL:
4679 fprintf(file, "null");
4680 break;
4681 case BRW_ARF_ADDRESS:
4682 fprintf(file, "a0.%d", inst->src[i].subnr);
4683 break;
4684 case BRW_ARF_ACCUMULATOR:
4685 fprintf(file, "acc%d", inst->src[i].subnr);
4686 break;
4687 case BRW_ARF_FLAG:
4688 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4689 break;
4690 default:
4691 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
4692 break;
4693 }
4694 if (inst->src[i].subnr)
4695 fprintf(file, "+%d", inst->src[i].subnr);
4696 break;
4697 }
4698 if (inst->src[i].abs)
4699 fprintf(file, "|");
4700
4701 if (inst->src[i].file != IMM) {
4702 unsigned stride;
4703 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
4704 unsigned hstride = inst->src[i].hstride;
4705 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
4706 } else {
4707 stride = inst->src[i].stride;
4708 }
4709 if (stride != 1)
4710 fprintf(file, "<%u>", stride);
4711
4712 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
4713 }
4714
4715 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
4716 fprintf(file, ", ");
4717 }
4718
4719 fprintf(file, " ");
4720
4721 if (inst->force_writemask_all)
4722 fprintf(file, "NoMask ");
4723
4724 if (dispatch_width == 16 && inst->exec_size == 8) {
4725 if (inst->force_sechalf)
4726 fprintf(file, "2ndhalf ");
4727 else
4728 fprintf(file, "1sthalf ");
4729 }
4730
4731 fprintf(file, "\n");
4732 }
4733
4734 /**
4735 * Possibly returns an instruction that set up @param reg.
4736 *
4737 * Sometimes we want to take the result of some expression/variable
4738 * dereference tree and rewrite the instruction generating the result
4739 * of the tree. When processing the tree, we know that the
4740 * instructions generated are all writing temporaries that are dead
4741 * outside of this tree. So, if we have some instructions that write
4742 * a temporary, we're free to point that temp write somewhere else.
4743 *
4744 * Note that this doesn't guarantee that the instruction generated
4745 * only reg -- it might be the size=4 destination of a texture instruction.
4746 */
4747 fs_inst *
4748 fs_visitor::get_instruction_generating_reg(fs_inst *start,
4749 fs_inst *end,
4750 const fs_reg &reg)
4751 {
4752 if (end == start ||
4753 end->is_partial_write() ||
4754 reg.reladdr ||
4755 !reg.equals(end->dst)) {
4756 return NULL;
4757 } else {
4758 return end;
4759 }
4760 }
4761
4762 void
4763 fs_visitor::setup_payload_gen6()
4764 {
4765 bool uses_depth =
4766 (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
4767 unsigned barycentric_interp_modes =
4768 (stage == MESA_SHADER_FRAGMENT) ?
4769 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
4770
4771 assert(devinfo->gen >= 6);
4772
4773 /* R0-1: masks, pixel X/Y coordinates. */
4774 payload.num_regs = 2;
4775 /* R2: only for 32-pixel dispatch.*/
4776
4777 /* R3-26: barycentric interpolation coordinates. These appear in the
4778 * same order that they appear in the brw_wm_barycentric_interp_mode
4779 * enum. Each set of coordinates occupies 2 registers if dispatch width
4780 * == 8 and 4 registers if dispatch width == 16. Coordinates only
4781 * appear if they were enabled using the "Barycentric Interpolation
4782 * Mode" bits in WM_STATE.
4783 */
4784 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
4785 if (barycentric_interp_modes & (1 << i)) {
4786 payload.barycentric_coord_reg[i] = payload.num_regs;
4787 payload.num_regs += 2;
4788 if (dispatch_width == 16) {
4789 payload.num_regs += 2;
4790 }
4791 }
4792 }
4793
4794 /* R27: interpolated depth if uses source depth */
4795 if (uses_depth) {
4796 payload.source_depth_reg = payload.num_regs;
4797 payload.num_regs++;
4798 if (dispatch_width == 16) {
4799 /* R28: interpolated depth if not SIMD8. */
4800 payload.num_regs++;
4801 }
4802 }
4803 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
4804 if (uses_depth) {
4805 payload.source_w_reg = payload.num_regs;
4806 payload.num_regs++;
4807 if (dispatch_width == 16) {
4808 /* R30: interpolated W if not SIMD8. */
4809 payload.num_regs++;
4810 }
4811 }
4812
4813 if (stage == MESA_SHADER_FRAGMENT) {
4814 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4815 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4816 prog_data->uses_pos_offset = key->compute_pos_offset;
4817 /* R31: MSAA position offsets. */
4818 if (prog_data->uses_pos_offset) {
4819 payload.sample_pos_reg = payload.num_regs;
4820 payload.num_regs++;
4821 }
4822 }
4823
4824 /* R32: MSAA input coverage mask */
4825 if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
4826 assert(devinfo->gen >= 7);
4827 payload.sample_mask_in_reg = payload.num_regs;
4828 payload.num_regs++;
4829 if (dispatch_width == 16) {
4830 /* R33: input coverage mask if not SIMD8. */
4831 payload.num_regs++;
4832 }
4833 }
4834
4835 /* R34-: bary for 32-pixel. */
4836 /* R58-59: interp W for 32-pixel. */
4837
4838 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4839 source_depth_to_render_target = true;
4840 }
4841 }
4842
4843 void
4844 fs_visitor::setup_vs_payload()
4845 {
4846 /* R0: thread header, R1: urb handles */
4847 payload.num_regs = 2;
4848 }
4849
4850 /**
4851 * We are building the local ID push constant data using the simplest possible
4852 * method. We simply push the local IDs directly as they should appear in the
4853 * registers for the uvec3 gl_LocalInvocationID variable.
4854 *
4855 * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
4856 * registers worth of push constant space.
4857 *
4858 * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
4859 * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
4860 * to coordinated.
4861  * to be coordinated.
4862 * FINISHME: There are a few easy optimizations to consider.
4863 *
4864 * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
4865 * no need for using push constant space for that dimension.
4866 *
4867 * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
4868 * easily use 16-bit words rather than 32-bit dwords in the push constant
4869 * data.
4870 *
4871 * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
4872 * conveying the data, and thereby reduce push constant usage.
4873 *
4874 */
4875 void
4876 fs_visitor::setup_gs_payload()
4877 {
4878 assert(stage == MESA_SHADER_GEOMETRY);
4879
4880 struct brw_gs_prog_data *gs_prog_data =
4881 (struct brw_gs_prog_data *) prog_data;
4882 struct brw_vue_prog_data *vue_prog_data =
4883 (struct brw_vue_prog_data *) prog_data;
4884
4885 /* R0: thread header, R1: output URB handles */
4886 payload.num_regs = 2;
4887
4888 if (gs_prog_data->include_primitive_id) {
4889 /* R2: Primitive ID 0..7 */
4890 payload.num_regs++;
4891 }
4892
4893 /* Use a maximum of 32 registers for push-model inputs. */
4894 const unsigned max_push_components = 32;
4895
4896 /* If pushing our inputs would take too many registers, reduce the URB read
4897 * length (which is in HWords, or 8 registers), and resort to pulling.
4898 *
4899 * Note that the GS reads <URB Read Length> HWords for every vertex - so we
4900 * have to multiply by VerticesIn to obtain the total storage requirement.
4901 */
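       /* For example (illustrative numbers): with 3 incoming vertices and a
        * URB read length of 2, 8 * 2 * 3 = 48 > 32, so we fall back to the
        * pull model and clamp the read length to
        * ROUND_DOWN_TO(32 / 3, 8) / 8 = 1.
        */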
4902 if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
4903 max_push_components) {
4904 gs_prog_data->base.include_vue_handles = true;
4905
4906 /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
4907 payload.num_regs += nir->info.gs.vertices_in;
4908
4909 vue_prog_data->urb_read_length =
4910 ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
4911 }
4912 }
4913
4914 void
4915 fs_visitor::setup_cs_payload()
4916 {
4917 assert(devinfo->gen >= 7);
4918 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
4919
4920 payload.num_regs = 1;
4921
4922 if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
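         /* One GRF holds 8 dwords, so each of the three gl_LocalInvocationID
          * components takes dispatch_width / 8 registers: 3 GRFs in SIMD8 and
          * 6 in SIMD16.
          */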
4923 prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
4924 payload.local_invocation_id_reg = payload.num_regs;
4925 payload.num_regs += prog_data->local_invocation_id_regs;
4926 }
4927 }
4928
4929 void
4930 fs_visitor::calculate_register_pressure()
4931 {
4932 invalidate_live_intervals();
4933 calculate_live_intervals();
4934
4935 unsigned num_instructions = 0;
4936 foreach_block(block, cfg)
4937 num_instructions += block->instructions.length();
4938
4939 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4940
4941 for (unsigned reg = 0; reg < alloc.count; reg++) {
4942 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4943 regs_live_at_ip[ip] += alloc.sizes[reg];
4944 }
4945 }
4946
4947 void
4948 fs_visitor::optimize()
4949 {
4950 /* Start by validating the shader we currently have. */
4951 validate();
4952
4953 /* bld is the common builder object pointing at the end of the program we
4954 * used to translate it into i965 IR. For the optimization and lowering
4955 * passes coming next, any code added after the end of the program without
4956 * having explicitly called fs_builder::at() clearly points at a mistake.
4957 * Ideally optimization passes wouldn't be part of the visitor so they
4958 * wouldn't have access to bld at all, but they do, so just in case some
4959 * pass forgets to ask for a location explicitly set it to NULL here to
4960 * make it trip. The dispatch width is initialized to a bogus value to
4961 * make sure that optimizations set the execution controls explicitly to
4962 * match the code they are manipulating instead of relying on the defaults.
4963 */
4964 bld = fs_builder(this, 64);
4965
4966 assign_constant_locations();
4967 demote_pull_constants();
4968
4969 validate();
4970
4971 split_virtual_grfs();
4972 validate();
4973
4974 #define OPT(pass, args...) ({ \
4975 pass_num++; \
4976 bool this_progress = pass(args); \
4977 \
4978 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4979 char filename[64]; \
4980 snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
4981 stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
4982 \
4983 backend_shader::dump_instructions(filename); \
4984 } \
4985 \
4986 validate(); \
4987 \
4988 progress = progress || this_progress; \
4989 this_progress; \
4990 })
4991
4992 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4993 char filename[64];
4994 snprintf(filename, 64, "%s%d-%s-00-start",
4995 stage_abbrev, dispatch_width, nir->info.name);
4996
4997 backend_shader::dump_instructions(filename);
4998 }
4999
5000 bool progress = false;
5001 int iteration = 0;
5002 int pass_num = 0;
5003
5004 OPT(lower_simd_width);
5005 OPT(lower_logical_sends);
5006
5007 do {
5008 progress = false;
5009 pass_num = 0;
5010 iteration++;
5011
5012 OPT(remove_duplicate_mrf_writes);
5013
5014 OPT(opt_algebraic);
5015 OPT(opt_cse);
5016 OPT(opt_copy_propagate);
5017 OPT(opt_predicated_break, this);
5018 OPT(opt_cmod_propagation);
5019 OPT(dead_code_eliminate);
5020 OPT(opt_peephole_sel);
5021 OPT(dead_control_flow_eliminate, this);
5022 OPT(opt_register_renaming);
5023 OPT(opt_redundant_discard_jumps);
5024 OPT(opt_saturate_propagation);
5025 OPT(opt_zero_samples);
5026 OPT(register_coalesce);
5027 OPT(compute_to_mrf);
5028 OPT(eliminate_find_live_channel);
5029
5030 OPT(compact_virtual_grfs);
5031 } while (progress);
5032
5033 pass_num = 0;
5034
5035 OPT(opt_sampler_eot);
5036
5037 if (OPT(lower_load_payload)) {
5038 split_virtual_grfs();
5039 OPT(register_coalesce);
5040 OPT(compute_to_mrf);
5041 OPT(dead_code_eliminate);
5042 }
5043
5044 OPT(opt_combine_constants);
5045 OPT(lower_integer_multiplication);
5046
5047 lower_uniform_pull_constant_loads();
5048
5049 validate();
5050 }
5051
5052 /**
5053  * Three-source instructions must have a GRF/MRF destination register.
5054 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
5055 */
5056 void
5057 fs_visitor::fixup_3src_null_dest()
5058 {
5059 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
5060 if (inst->is_3src() && inst->dst.is_null()) {
5061 inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5062 inst->dst.type);
5063 }
5064 }
5065 }
5066
5067 void
5068 fs_visitor::allocate_registers()
5069 {
5070 bool allocated_without_spills;
5071
5072 static const enum instruction_scheduler_mode pre_modes[] = {
5073 SCHEDULE_PRE,
5074 SCHEDULE_PRE_NON_LIFO,
5075 SCHEDULE_PRE_LIFO,
5076 };
5077
5078 /* Try each scheduling heuristic to see if it can successfully register
5079 * allocate without spilling. They should be ordered by decreasing
5080 * performance but increasing likelihood of allocating.
5081 */
5082 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5083 schedule_instructions(pre_modes[i]);
5084
5085 if (0) {
5086 assign_regs_trivial();
5087 allocated_without_spills = true;
5088 } else {
5089 allocated_without_spills = assign_regs(false);
5090 }
5091 if (allocated_without_spills)
5092 break;
5093 }
5094
5095 if (!allocated_without_spills) {
5096 /* We assume that any spilling is worse than just dropping back to
5097 * SIMD8. There's probably actually some intermediate point where
5098 * SIMD16 with a couple of spills is still better.
5099 */
5100 if (dispatch_width == 16) {
5101 fail("Failure to register allocate. Reduce number of "
5102 "live scalar values to avoid this.");
5103 } else {
5104 compiler->shader_perf_log(log_data,
5105 "%s shader triggered register spilling. "
5106 "Try reducing the number of live scalar "
5107 "values to improve performance.\n",
5108 stage_name);
5109 }
5110
5111 /* Since we're out of heuristics, just go spill registers until we
5112 * get an allocation.
5113 */
5114 while (!assign_regs(true)) {
5115 if (failed)
5116 break;
5117 }
5118 }
5119
5120 /* This must come after all optimization and register allocation, since
5121 * it inserts dead code that happens to have side effects, and it does
5122 * so based on the actual physical registers in use.
5123 */
5124 insert_gen4_send_dependency_workarounds();
5125
5126 if (failed)
5127 return;
5128
5129 schedule_instructions(SCHEDULE_POST);
5130
5131 if (last_scratch > 0)
5132 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
5133 }
5134
5135 bool
5136 fs_visitor::run_vs(gl_clip_plane *clip_planes)
5137 {
5138 assert(stage == MESA_SHADER_VERTEX);
5139
5140 setup_vs_payload();
5141
5142 if (shader_time_index >= 0)
5143 emit_shader_time_begin();
5144
5145 emit_nir_code();
5146
5147 if (failed)
5148 return false;
5149
5150 compute_clip_distance(clip_planes);
5151
5152 emit_urb_writes();
5153
5154 if (shader_time_index >= 0)
5155 emit_shader_time_end();
5156
5157 calculate_cfg();
5158
5159 optimize();
5160
5161 assign_curb_setup();
5162 assign_vs_urb_setup();
5163
5164 fixup_3src_null_dest();
5165 allocate_registers();
5166
5167 return !failed;
5168 }
5169
5170 bool
5171 fs_visitor::run_gs()
5172 {
5173 assert(stage == MESA_SHADER_GEOMETRY);
5174
5175 setup_gs_payload();
5176
5177 this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
5178
5179 if (gs_compile->control_data_header_size_bits > 0) {
5180 /* Create a VGRF to store accumulated control data bits. */
5181 this->control_data_bits = vgrf(glsl_type::uint_type);
5182
5183 /* If we're outputting more than 32 control data bits, then EmitVertex()
5184 * will set control_data_bits to 0 after emitting the first vertex.
5185 * Otherwise, we need to initialize it to 0 here.
5186 */
5187 if (gs_compile->control_data_header_size_bits <= 32) {
5188 const fs_builder abld = bld.annotate("initialize control data bits");
5189 abld.MOV(this->control_data_bits, brw_imm_ud(0u));
5190 }
5191 }
5192
5193 if (shader_time_index >= 0)
5194 emit_shader_time_begin();
5195
5196 emit_nir_code();
5197
5198 emit_gs_thread_end();
5199
5200 if (shader_time_index >= 0)
5201 emit_shader_time_end();
5202
5203 if (failed)
5204 return false;
5205
5206 calculate_cfg();
5207
5208 optimize();
5209
5210 assign_curb_setup();
5211 assign_gs_urb_setup();
5212
5213 fixup_3src_null_dest();
5214 allocate_registers();
5215
5216 return !failed;
5217 }
5218
5219 bool
5220 fs_visitor::run_fs(bool do_rep_send)
5221 {
5222 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
5223 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
5224
5225 assert(stage == MESA_SHADER_FRAGMENT);
5226
5227 if (devinfo->gen >= 6)
5228 setup_payload_gen6();
5229 else
5230 setup_payload_gen4();
5231
5232 if (0) {
5233 emit_dummy_fs();
5234 } else if (do_rep_send) {
5235 assert(dispatch_width == 16);
5236 emit_repclear_shader();
5237 } else {
5238 if (shader_time_index >= 0)
5239 emit_shader_time_begin();
5240
5241 calculate_urb_setup();
5242 if (nir->info.inputs_read > 0) {
5243 if (devinfo->gen < 6)
5244 emit_interpolation_setup_gen4();
5245 else
5246 emit_interpolation_setup_gen6();
5247 }
5248
5249 /* We handle discards by keeping track of the still-live pixels in f0.1.
5250 * Initialize it with the dispatched pixels.
5251 */
5252 if (wm_prog_data->uses_kill) {
5253 fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
5254 discard_init->flag_subreg = 1;
5255 }
5256
5257 /* Generate FS IR for main(). (the visitor only descends into
5258 * functions called "main").
5259 */
5260 emit_nir_code();
5261
5262 if (failed)
5263 return false;
5264
5265 if (wm_prog_data->uses_kill)
5266 bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
5267
5268 if (wm_key->alpha_test_func)
5269 emit_alpha_test();
5270
5271 emit_fb_writes();
5272
5273 if (shader_time_index >= 0)
5274 emit_shader_time_end();
5275
5276 calculate_cfg();
5277
5278 optimize();
5279
5280 assign_curb_setup();
5281 assign_urb_setup();
5282
5283 fixup_3src_null_dest();
5284 allocate_registers();
5285
5286 if (failed)
5287 return false;
5288 }
5289
5290 if (dispatch_width == 8)
5291 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
5292 else
5293 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
5294
5295 return !failed;
5296 }
5297
5298 bool
5299 fs_visitor::run_cs()
5300 {
5301 assert(stage == MESA_SHADER_COMPUTE);
5302
5303 setup_cs_payload();
5304
5305 if (shader_time_index >= 0)
5306 emit_shader_time_begin();
5307
5308 emit_nir_code();
5309
5310 if (failed)
5311 return false;
5312
5313 emit_cs_terminate();
5314
5315 if (shader_time_index >= 0)
5316 emit_shader_time_end();
5317
5318 calculate_cfg();
5319
5320 optimize();
5321
5322 assign_curb_setup();
5323
5324 fixup_3src_null_dest();
5325 allocate_registers();
5326
5327 if (failed)
5328 return false;
5329
5330 return !failed;
5331 }
5332
5333 /**
5334 * Return a bitfield where bit n is set if barycentric interpolation mode n
5335 * (see enum brw_wm_barycentric_interp_mode) is needed by the fragment shader.
5336 */
5337 static unsigned
5338 brw_compute_barycentric_interp_modes(const struct brw_device_info *devinfo,
5339 bool shade_model_flat,
5340 bool persample_shading,
5341 const nir_shader *shader)
5342 {
5343 unsigned barycentric_interp_modes = 0;
5344
5345 nir_foreach_variable(var, &shader->inputs) {
5346 enum glsl_interp_qualifier interp_qualifier =
5347 (enum glsl_interp_qualifier)var->data.interpolation;
5348 bool is_centroid = var->data.centroid && !persample_shading;
5349 bool is_sample = var->data.sample || persample_shading;
5350 bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
5351 (var->data.location == VARYING_SLOT_COL1);
5352
5353 /* Ignore WPOS and FACE, because they don't require interpolation. */
5354 if (var->data.location == VARYING_SLOT_POS ||
5355 var->data.location == VARYING_SLOT_FACE)
5356 continue;
5357
5358 /* Determine the set (or sets) of barycentric coordinates needed to
5359 * interpolate this variable. Note that when
5360 * brw->needs_unlit_centroid_workaround is set, centroid interpolation
5361 * uses PIXEL interpolation for unlit pixels and CENTROID interpolation
5362 * for lit pixels, so we need both sets of barycentric coordinates.
5363 */
5364 if (interp_qualifier == INTERP_QUALIFIER_NOPERSPECTIVE) {
5365 if (is_centroid) {
5366 barycentric_interp_modes |=
5367 1 << BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
5368 } else if (is_sample) {
5369 barycentric_interp_modes |=
5370 1 << BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
5371 }
5372 if ((!is_centroid && !is_sample) ||
5373 devinfo->needs_unlit_centroid_workaround) {
5374 barycentric_interp_modes |=
5375 1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
5376 }
5377 } else if (interp_qualifier == INTERP_QUALIFIER_SMOOTH ||
5378 (!(shade_model_flat && is_gl_Color) &&
5379 interp_qualifier == INTERP_QUALIFIER_NONE)) {
5380 if (is_centroid) {
5381 barycentric_interp_modes |=
5382 1 << BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
5383 } else if (is_sample) {
5384 barycentric_interp_modes |=
5385 1 << BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
5386 }
5387 if ((!is_centroid && !is_sample) ||
5388 devinfo->needs_unlit_centroid_workaround) {
5389 barycentric_interp_modes |=
5390 1 << BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
5391 }
5392 }
5393 }
5394
5395 return barycentric_interp_modes;
5396 }
5397
5398 static uint8_t
5399 computed_depth_mode(const nir_shader *shader)
5400 {
5401 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
5402 switch (shader->info.fs.depth_layout) {
5403 case FRAG_DEPTH_LAYOUT_NONE:
5404 case FRAG_DEPTH_LAYOUT_ANY:
5405 return BRW_PSCDEPTH_ON;
5406 case FRAG_DEPTH_LAYOUT_GREATER:
5407 return BRW_PSCDEPTH_ON_GE;
5408 case FRAG_DEPTH_LAYOUT_LESS:
5409 return BRW_PSCDEPTH_ON_LE;
5410 case FRAG_DEPTH_LAYOUT_UNCHANGED:
5411 return BRW_PSCDEPTH_OFF;
5412 }
5413 }
5414 return BRW_PSCDEPTH_OFF;
5415 }
5416
5417 const unsigned *
5418 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
5419 void *mem_ctx,
5420 const struct brw_wm_prog_key *key,
5421 struct brw_wm_prog_data *prog_data,
5422 const nir_shader *src_shader,
5423 struct gl_program *prog,
5424 int shader_time_index8, int shader_time_index16,
5425 bool use_rep_send,
5426 unsigned *final_assembly_size,
5427 char **error_str)
5428 {
5429 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
5430 shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
5431 true);
5432 shader = brw_postprocess_nir(shader, compiler->devinfo, true);
5433
5434 /* key->alpha_test_func means simulating alpha testing via discards,
5435 * so the shader definitely kills pixels.
5436 */
5437 prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
5438 prog_data->uses_omask =
5439 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
5440 prog_data->computed_depth_mode = computed_depth_mode(shader);
5441 prog_data->computed_stencil =
5442 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
5443
5444 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
5445
5446 prog_data->barycentric_interp_modes =
5447 brw_compute_barycentric_interp_modes(compiler->devinfo,
5448 key->flat_shade,
5449 key->persample_shading,
5450 shader);
5451
5452 fs_visitor v(compiler, log_data, mem_ctx, key,
5453 &prog_data->base, prog, shader, 8,
5454 shader_time_index8);
5455 if (!v.run_fs(false /* do_rep_send */)) {
5456 if (error_str)
5457 *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
5458
5459 return NULL;
5460 }
5461
5462 cfg_t *simd16_cfg = NULL;
5463 fs_visitor v2(compiler, log_data, mem_ctx, key,
5464 &prog_data->base, prog, shader, 16,
5465 shader_time_index16);
5466 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
5467 if (!v.simd16_unsupported) {
5468 /* Try a SIMD16 compile */
5469 v2.import_uniforms(&v);
5470 if (!v2.run_fs(use_rep_send)) {
5471 compiler->shader_perf_log(log_data,
5472 "SIMD16 shader failed to compile: %s",
5473 v2.fail_msg);
5474 } else {
5475 simd16_cfg = v2.cfg;
5476 }
5477 }
5478 }
5479
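/* Decide which programs to hand to the generator.  The SIMD8 program is
 * dropped only when it was explicitly disabled (no8 debug flag or a
 * replicated-data send) or on pre-Gen5 hardware, and even then only if a
 * SIMD16 program is actually available to take its place.
 */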
5480 cfg_t *simd8_cfg;
5481 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
5482 if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
5483 simd8_cfg = NULL;
5484 prog_data->no_8 = true;
5485 } else {
5486 simd8_cfg = v.cfg;
5487 prog_data->no_8 = false;
5488 }
5489
5490 fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
5491 v.promoted_constants, v.runtime_check_aads_emit, "FS");
5492
5493 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
5494 g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
5495 shader->info.label ? shader->info.label :
5496 "unnamed",
5497 shader->info.name));
5498 }
5499
5500 if (simd8_cfg)
5501 g.generate_code(simd8_cfg, 8);
5502 if (simd16_cfg)
5503 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
5504
5505 return g.get_assembly(final_assembly_size);
5506 }
5507
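/* Load gl_LocalInvocationID into a uvec3 VGRF.  The three components are
 * delivered in consecutive registers of the compute thread payload starting
 * at payload.local_invocation_id_reg; each component occupies one GRF per 8
 * channels, hence the dispatch_width / 8 stride between the MOVs.
 */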
5508 fs_reg *
5509 fs_visitor::emit_cs_local_invocation_id_setup()
5510 {
5511 assert(stage == MESA_SHADER_COMPUTE);
5512
5513 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5514
5515 struct brw_reg src =
5516 brw_vec8_grf(payload.local_invocation_id_reg, 0);
5517 src = retype(src, BRW_REGISTER_TYPE_UD);
5518 bld.MOV(*reg, src);
5519 src.nr += dispatch_width / 8;
5520 bld.MOV(offset(*reg, bld, 1), src);
5521 src.nr += dispatch_width / 8;
5522 bld.MOV(offset(*reg, bld, 2), src);
5523
5524 return reg;
5525 }
5526
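/* Load gl_WorkGroupID into a uvec3 VGRF.  The X, Y and Z group IDs are taken
 * from r0.1, r0.6 and r0.7 of the compute thread payload.
 */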
5527 fs_reg *
5528 fs_visitor::emit_cs_work_group_id_setup()
5529 {
5530 assert(stage == MESA_SHADER_COMPUTE);
5531
5532 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
5533
5534 struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
5535 struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
5536 struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
5537
5538 bld.MOV(*reg, r0_1);
5539 bld.MOV(offset(*reg, bld, 1), r0_6);
5540 bld.MOV(offset(*reg, bld, 2), r0_7);
5541
5542 return reg;
5543 }
5544
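/* Compile a compute shader.  A SIMD8 program is always built, but it can only
 * be used if the whole local work group fits in max_cs_threads SIMD8 threads.
 * When allowed (no16 debug flag not set, SIMD16 not ruled out by the SIMD8
 * compile, and the group fits in 16-wide threads), a SIMD16 variant is
 * attempted and preferred.  If neither width yields a usable program, NULL is
 * returned and *error_str (when provided) receives the failure message.
 */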
5545 const unsigned *
5546 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
5547 void *mem_ctx,
5548 const struct brw_cs_prog_key *key,
5549 struct brw_cs_prog_data *prog_data,
5550 const nir_shader *src_shader,
5551 int shader_time_index,
5552 unsigned *final_assembly_size,
5553 char **error_str)
5554 {
5555 nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
5556 shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
5557 true);
5558 shader = brw_postprocess_nir(shader, compiler->devinfo, true);
5559
5560 prog_data->local_size[0] = shader->info.cs.local_size[0];
5561 prog_data->local_size[1] = shader->info.cs.local_size[1];
5562 prog_data->local_size[2] = shader->info.cs.local_size[2];
5563 unsigned local_workgroup_size =
5564 shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
5565 shader->info.cs.local_size[2];
5566
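/* A SIMDn program can only cover the work group if local_workgroup_size
 * invocations fit in max_cs_threads threads of n channels each.  For example
 * (hypothetical numbers), with max_cs_threads == 64 a 16x16x4 group of 1024
 * invocations is too large for SIMD8 (8 * 64 = 512) but exactly fits SIMD16
 * (16 * 64 = 1024).
 */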
5567 unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
5568
5569 cfg_t *cfg = NULL;
5570 const char *fail_msg = NULL;
5571
5572 /* Now the main event: Visit the shader IR and generate our CS IR for it.
5573 */
5574 fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
5575 NULL, /* Never used in core profile */
5576 shader, 8, shader_time_index);
5577 if (!v8.run_cs()) {
5578 fail_msg = v8.fail_msg;
5579 } else if (local_workgroup_size <= 8 * max_cs_threads) {
5580 cfg = v8.cfg;
5581 prog_data->simd_size = 8;
5582 }
5583
5584 fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
5585 NULL, /* Never used in core profile */
5586 shader, 16, shader_time_index);
5587 if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
5588 !fail_msg && !v8.simd16_unsupported &&
5589 local_workgroup_size <= 16 * max_cs_threads) {
5590 /* Try a SIMD16 compile */
5591 v16.import_uniforms(&v8);
5592 if (!v16.run_cs()) {
5593 compiler->shader_perf_log(log_data,
5594 "SIMD16 shader failed to compile: %s",
5595 v16.fail_msg);
5596 if (!cfg) {
5597 fail_msg =
5598 "Couldn't generate SIMD16 program and not "
5599 "enough threads for SIMD8";
5600 }
5601 } else {
5602 cfg = v16.cfg;
5603 prog_data->simd_size = 16;
5604 }
5605 }
5606
5607 if (unlikely(cfg == NULL)) {
5608 assert(fail_msg);
5609 if (error_str)
5610 *error_str = ralloc_strdup(mem_ctx, fail_msg);
5611
5612 return NULL;
5613 }
5614
5615 fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
5616 v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
5617 if (unlikely(INTEL_DEBUG & DEBUG_CS)) {
5618 char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
5619 shader->info.label ? shader->info.label :
5620 "unnamed",
5621 shader->info.name);
5622 g.enable_debug(name);
5623 }
5624
5625 g.generate_code(cfg, prog_data->simd_size);
5626
5627 return g.get_assembly(final_assembly_size);
5628 }