src/intel/compiler/brw_fs_generator.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_generator.cpp
  25  *
  26  * This file supports generating code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 #include "brw_eu.h"
  31 #include "brw_fs.h"
  32 #include "brw_cfg.h"
  33 #include "util/mesa-sha1.h"
  34
  35 static enum brw_reg_file
  36 brw_file_from_reg(fs_reg *reg)
  37 {
  38    switch (reg->file) {
  39    case ARF:
  40       return BRW_ARCHITECTURE_REGISTER_FILE;
  41    case FIXED_GRF:
  42    case VGRF:
  43       return BRW_GENERAL_REGISTER_FILE;
  44    case MRF:
  45       return BRW_MESSAGE_REGISTER_FILE;
  46    case IMM:
  47       return BRW_IMMEDIATE_VALUE;
  48    case BAD_FILE:
  49    case ATTR:
  50    case UNIFORM:
  51       unreachable("not reached");
  52    }
  53    return BRW_ARCHITECTURE_REGISTER_FILE;
  54 }
  55
  56 static struct brw_reg
  57 brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
  58                     fs_reg *reg, bool compressed)
  59 {
  60    struct brw_reg brw_reg;
  61
  62    switch (reg->file) {
  63    case MRF:
  64       assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
  65       /* Fallthrough */
  66    case VGRF:
  67       if (reg->stride == 0) {
  68          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
  69       } else {
  70          /* From the Haswell PRM:
  71           *
  72           *  "VertStride must be used to cross GRF register boundaries. This
  73           *   rule implies that elements within a 'Width' cannot cross GRF
  74           *   boundaries."
  75           *
  76           * The maximum width value that could satisfy this restriction is:
  77           */
  78          const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
  79
  80          /* Because the hardware can only split source regions at a whole
  81           * multiple of width during decompression (i.e. vertically), clamp
  82           * the value obtained above to the physical execution size of a
  83           * single decompressed chunk of the instruction:
  84           */
  85          const unsigned phys_width = compressed ? inst->exec_size / 2 :
  86                                      inst->exec_size;
  87
  88          const unsigned max_hw_width = 16;
  89
  90          /* XXX - The equation above is strictly speaking not correct on
  91           *       hardware that supports unbalanced GRF writes -- On Gen9+
  92           *       each decompressed chunk of the instruction may have a
  93           *       different execution size when the number of components
  94           *       written to each destination GRF is not the same.
  95           */
  96          if (reg->stride > 4) {
  97             assert(reg != &inst->dst);
  98             assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
  99             brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
 100             brw_reg = stride(brw_reg, reg->stride, 1, 0);
 101          } else {
 102             const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
 103             brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
 104             brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
 105          }
 106
 107          if (devinfo->gen == 7 && !devinfo->is_haswell) {
 108             /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
 109              *  "Each DF (Double Float) operand uses an element size of 4 rather
 110              *   than 8 and all regioning parameters are twice what the values
 111              *   would be based on the true element size: ExecSize, Width,
 112              *   HorzStride, and VertStride. Each DF operand uses a pair of
 113              *   channels and all masking and swizzing should be adjusted
 114              *   appropriately."
 115              *
 116              * From the IvyBridge PRM (Special Requirements for Handling Double
 117              * Precision Data Types, page 71):
 118              *  "In Align1 mode, all regioning parameters like stride, execution
 119              *   size, and width must use the syntax of a pair of packed
 120              *   floats. The offsets for these data types must be 64-bit
 121              *   aligned. The execution size and regioning parameters are in terms
 122              *   of floats."
 123              *
 124              * Summarized: when handling DF-typed arguments, ExecSize,
 125              * VertStride, and Width must be doubled.
 126              *
 127              * It applies to BayTrail too.
 128              */
 129             if (type_sz(reg->type) == 8) {
 130                brw_reg.width++;
 131                if (brw_reg.vstride > 0)
 132                   brw_reg.vstride++;
 133                assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
 134             }
 135
 136             /* When converting from DF->F, we set the destination stride to 2
 137              * because each d2f conversion implicitly writes 2 floats, being
 138              * the first one the converted value. IVB/BYT actually writes two
 139              * F components per SIMD channel, and every other component is
 140              * filled with garbage.
 141              */
 142             if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
 143                 type_sz(inst->dst.type) < 8) {
 144                assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
 145                brw_reg.hstride--;
 146             }
 147          }
 148       }
 149
 150       brw_reg = retype(brw_reg, reg->type);
 151       brw_reg = byte_offset(brw_reg, reg->offset);
 152       brw_reg.abs = reg->abs;
 153       brw_reg.negate = reg->negate;
 154       break;
 155    case ARF:
 156    case FIXED_GRF:
 157    case IMM:
 158       assert(reg->offset == 0);
 159       brw_reg = reg->as_brw_reg();
 160       break;
 161    case BAD_FILE:
 162       /* Probably unused. */
 163       brw_reg = brw_null_reg();
 164       break;
 165    case ATTR:
 166    case UNIFORM:
 167       unreachable("not reached");
 168    }
 169
 170    /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
 171     * region, but on IVB and BYT DF regions must be programmed in terms of
 172     * floats. A <0,2,1> region accomplishes this.
 173     */
 174    if (devinfo->gen == 7 && !devinfo->is_haswell &&
 175        type_sz(reg->type) == 8 &&
 176        brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
 177        brw_reg.width == BRW_WIDTH_1 &&
 178        brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
 179       brw_reg.width = BRW_WIDTH_2;
 180       brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
 181    }
 182
 183    return brw_reg;
 184 }
 185
 186 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
 187                            void *mem_ctx,
 188                            struct brw_stage_prog_data *prog_data,
 189                            bool runtime_check_aads_emit,
 190                            gl_shader_stage stage)
 191
 192    : compiler(compiler), log_data(log_data),
 193      devinfo(compiler->devinfo),
 194      prog_data(prog_data),
 195      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
 196      stage(stage), mem_ctx(mem_ctx)
 197 {
 198    p = rzalloc(mem_ctx, struct brw_codegen);
 199    brw_init_codegen(devinfo, p, mem_ctx);
 200
 201    /* In the FS code generator, we are very careful to ensure that we always
 202     * set the right execution size so we don't need the EU code to "help" us
 203     * by trying to infer it.  Sometimes, it infers the wrong thing.
 204     */
 205    p->automatic_exec_sizes = false;
 206 }
 207
 208 fs_generator::~fs_generator()
 209 {
 210 }
 211
 212 class ip_record : public exec_node {
 213 public:
 214    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
 215
 216    ip_record(int ip)
 217    {
 218       this->ip = ip;
 219    }
 220
 221    int ip;
 222 };
 223
 224 bool
 225 fs_generator::patch_discard_jumps_to_fb_writes()
 226 {
 227    if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
 228       return false;
 229
 230    int scale = brw_jump_scale(p->devinfo);
 231
 232    /* There is a somewhat strange undocumented requirement of using
 233     * HALT, according to the simulator.  If some channel has HALTed to
 234     * a particular UIP, then by the end of the program, every channel
 235     * must have HALTed to that UIP.  Furthermore, the tracking is a
 236     * stack, so you can't do the final halt of a UIP after starting
 237     * halting to a new UIP.
 238     *
 239     * Symptoms of not emitting this instruction on actual hardware
 240     * included GPU hangs and sparkly rendering on the piglit discard
 241     * tests.
 242     */
 243    brw_inst *last_halt = gen6_HALT(p);
 244    brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
 245    brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
 246
 247    int ip = p->nr_insn;
 248
 249    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
 250       brw_inst *patch = &p->store[patch_ip->ip];
 251
 252       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
 253       /* HALT takes a half-instruction distance from the pre-incremented IP. */
 254       brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
 255    }
 256
 257    this->discard_halt_patches.make_empty();
 258    return true;
 259 }
 260
 261 void
 262 fs_generator::generate_send(fs_inst *inst,
 263                             struct brw_reg dst,
 264                             struct brw_reg desc,
 265                             struct brw_reg ex_desc,
 266                             struct brw_reg payload,
 267                             struct brw_reg payload2)
 268 {
 269    const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 270                             dst.nr == BRW_ARF_NULL;
 271    const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE;
 272
 273    uint32_t desc_imm = inst->desc |
 274       brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
 275
 276    uint32_t ex_desc_imm = brw_message_ex_desc(devinfo, inst->ex_mlen);
 277
 278    if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
 279       /* If we have any sort of extended descriptor, then we need SENDS.  This
 280        * also covers the dual-payload case because ex_mlen goes in ex_desc.
 281        */
 282       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
 283                                       desc, desc_imm, ex_desc, ex_desc_imm,
 284                                       inst->eot);
 285       if (inst->check_tdr)
 286          brw_inst_set_opcode(p->devinfo, brw_last_inst,
 287                              devinfo->gen >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
 288    } else {
 289       brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
 290                                    inst->eot);
 291       if (inst->check_tdr)
 292          brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
 293    }
 294 }
 295
 296 void
 297 fs_generator::fire_fb_write(fs_inst *inst,
 298                             struct brw_reg payload,
 299                             struct brw_reg implied_header,
 300                             GLuint nr)
 301 {
 302    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 303
 304    if (devinfo->gen < 6) {
 305       brw_push_insn_state(p);
 306       brw_set_default_exec_size(p, BRW_EXECUTE_8);
 307       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 308       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 309       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 310       brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
 311               offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
 312       brw_pop_insn_state(p);
 313    }
 314
 315    uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data);
 316
 317    /* We assume render targets start at 0, because headerless FB write
 318     * messages set "Render Target Index" to 0.  Using a different binding
 319     * table index would make it impossible to use headerless messages.
 320     */
 321    const uint32_t surf_index = inst->target;
 322
 323    brw_inst *insn = brw_fb_WRITE(p,
 324                                  payload,
 325                                  retype(implied_header, BRW_REGISTER_TYPE_UW),
 326                                  msg_control,
 327                                  surf_index,
 328                                  nr,
 329                                  0,
 330                                  inst->eot,
 331                                  inst->last_rt,
 332                                  inst->header_size != 0);
 333
 334    if (devinfo->gen >= 6)
 335       brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
 336 }
 337
 338 void
 339 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
 340 {
 341    if (devinfo->gen < 8 && !devinfo->is_haswell) {
 342       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 343       brw_set_default_flag_reg(p, 0, 0);
 344    }
 345
 346    const struct brw_reg implied_header =
 347       devinfo->gen < 6 ? payload : brw_null_reg();
 348
 349    if (inst->base_mrf >= 0)
 350       payload = brw_message_reg(inst->base_mrf);
 351
 352    if (!runtime_check_aads_emit) {
 353       fire_fb_write(inst, payload, implied_header, inst->mlen);
 354    } else {
 355       /* This can only happen in gen < 6 */
 356       assert(devinfo->gen < 6);
 357
 358       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
 359
 360       /* Check runtime bit to detect if we have to send AA data or not */
 361       brw_push_insn_state(p);
 362       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 363       brw_set_default_exec_size(p, BRW_EXECUTE_1);
 364       brw_AND(p,
 365               v1_null_ud,
 366               retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
 367               brw_imm_ud(1<<26));
 368       brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
 369
 370       int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
 371       brw_pop_insn_state(p);
 372       {
 373          /* Don't send AA data */
 374          fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
 375       }
 376       brw_land_fwd_jump(p, jmp);
 377       fire_fb_write(inst, payload, implied_header, inst->mlen);
 378    }
 379 }
 380
 381 void
 382 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
 383                                struct brw_reg payload)
 384 {
 385    assert(inst->size_written % REG_SIZE == 0);
 386    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 387    /* We assume that render targets start at binding table index 0. */
 388    const unsigned surf_index = inst->target;
 389
 390    gen9_fb_READ(p, dst, payload, surf_index,
 391                 inst->header_size, inst->size_written / REG_SIZE,
 392                 prog_data->persample_dispatch);
 393 }
 394
 395 void
 396 fs_generator::generate_mov_indirect(fs_inst *inst,
 397                                     struct brw_reg dst,
 398                                     struct brw_reg reg,
 399                                     struct brw_reg indirect_byte_offset)
 400 {
 401    assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
 402    assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
 403    assert(!reg.abs && !reg.negate);
 404    assert(reg.type == dst.type);
 405
 406    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
 407
 408    if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
 409       imm_byte_offset += indirect_byte_offset.ud;
 410
 411       reg.nr = imm_byte_offset / REG_SIZE;
 412       reg.subnr = imm_byte_offset % REG_SIZE;
 413       brw_MOV(p, dst, reg);
 414    } else {
 415       /* Prior to Broadwell, there are only 8 address registers. */
 416       assert(inst->exec_size <= 8 || devinfo->gen >= 8);
 417
 418       /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
 419       struct brw_reg addr = vec8(brw_address_reg(0));
 420
 421       /* Whether we can use destination dependency control without running the
 422        * risk of a hang if an instruction gets shot down.
 423        */
 424       const bool use_dep_ctrl = !inst->predicate &&
 425                                 inst->exec_size == dispatch_width;
 426       brw_inst *insn;
 427
 428       /* The destination stride of an instruction (in bytes) must be greater
 429        * than or equal to the size of the rest of the instruction.  Since the
 430        * address register is of type UW, we can't use a D-type instruction.
 431        * In order to get around this, re retype to UW and use a stride.
 432        */
 433       indirect_byte_offset =
 434          retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
 435
 436       /* There are a number of reasons why we don't use the base offset here.
 437        * One reason is that the field is only 9 bits which means we can only
 438        * use it to access the first 16 GRFs.  Also, from the Haswell PRM
 439        * section "Register Region Restrictions":
 440        *
 441        *    "The lower bits of the AddressImmediate must not overflow to
 442        *    change the register address.  The lower 5 bits of Address
 443        *    Immediate when added to lower 5 bits of address register gives
 444        *    the sub-register offset. The upper bits of Address Immediate
 445        *    when added to upper bits of address register gives the register
 446        *    address. Any overflow from sub-register offset is dropped."
 447        *
 448        * Since the indirect may cause us to cross a register boundary, this
 449        * makes the base offset almost useless.  We could try and do something
 450        * clever where we use a actual base offset if base_offset % 32 == 0 but
 451        * that would mean we were generating different code depending on the
 452        * base offset.  Instead, for the sake of consistency, we'll just do the
 453        * add ourselves.  This restriction is only listed in the Haswell PRM
 454        * but empirical testing indicates that it applies on all older
 455        * generations and is lifted on Broadwell.
 456        *
 457        * In the end, while base_offset is nice to look at in the generated
 458        * code, using it saves us 0 instructions and would require quite a bit
 459        * of case-by-case work.  It's just not worth it.
 460        *
 461        * Due to a hardware bug some platforms (particularly Gen11+) seem to
 462        * require the address components of all channels to be valid whether or
 463        * not they're active, which causes issues if we use VxH addressing
 464        * under non-uniform control-flow.  We can easily work around that by
 465        * initializing the whole address register with a pipelined NoMask MOV
 466        * instruction.
 467        */
 468       if (devinfo->gen >= 7) {
 469          insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
 470          brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
 471          brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
 472          if (devinfo->gen >= 12)
 473             brw_set_default_swsb(p, tgl_swsb_null());
 474          else
 475             brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
 476       }
 477
 478       insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
 479       if (devinfo->gen >= 12)
 480          brw_set_default_swsb(p, tgl_swsb_regdist(1));
 481       else if (devinfo->gen >= 7)
 482          brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
 483
 484       if (type_sz(reg.type) > 4 &&
 485           ((devinfo->gen == 7 && !devinfo->is_haswell) ||
 486            devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
 487            !devinfo->has_64bit_float)) {
 488          /* IVB has an issue (which we found empirically) where it reads two
 489           * address register components per channel for indirectly addressed
 490           * 64-bit sources.
 491           *
 492           * From the Cherryview PRM Vol 7. "Register Region Restrictions":
 493           *
 494           *    "When source or destination datatype is 64b or operation is
 495           *    integer DWord multiply, indirect addressing must not be used."
 496           *
 497           * To work around both of these, we do two integer MOVs insead of one
 498           * 64-bit MOV.  Because no double value should ever cross a register
 499           * boundary, it's safe to use the immediate offset in the indirect
 500           * here to handle adding 4 bytes to the offset and avoid the extra
 501           * ADD to the register file.
 502           */
 503          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
 504                     retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
 505          brw_set_default_swsb(p, tgl_swsb_null());
 506          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
 507                     retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
 508       } else {
 509          struct brw_reg ind_src = brw_VxH_indirect(0, 0);
 510
 511          brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
 512
 513          if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
 514              !inst->get_next()->is_tail_sentinel() &&
 515              ((fs_inst *)inst->get_next())->mlen > 0) {
 516             /* From the Sandybridge PRM:
 517              *
 518              *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
 519              *    instruction that “indexed/indirect” source AND is followed
 520              *    by a send, the instruction requires a “Switch”. This is to
 521              *    avoid race condition where send may dispatch before MRF is
 522              *    updated."
 523              */
 524             brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
 525          }
 526       }
 527    }
 528 }
 529
 530 void
 531 fs_generator::generate_shuffle(fs_inst *inst,
 532                                struct brw_reg dst,
 533                                struct brw_reg src,
 534                                struct brw_reg idx)
 535 {
 536    /* Ivy bridge has some strange behavior that makes this a real pain to
 537     * implement for 64-bit values so we just don't bother.
 538     */
 539    assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
 540
 541    /* Because we're using the address register, we're limited to 8-wide
 542     * execution on gen7.  On gen8, we're limited to 16-wide by the address
 543     * register file and 8-wide for 64-bit types.  We could try and make this
 544     * instruction splittable higher up in the compiler but that gets weird
 545     * because it reads all of the channels regardless of execution size.  It's
 546     * easier just to split it here.
 547     */
 548    const unsigned lower_width =
 549       (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
 550       8 : MIN2(16, inst->exec_size);
 551
 552    brw_set_default_exec_size(p, cvt(lower_width) - 1);
 553    for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
 554       brw_set_default_group(p, group);
 555
 556       if ((src.vstride == 0 && src.hstride == 0) ||
 557           idx.file == BRW_IMMEDIATE_VALUE) {
 558          /* Trivial, the source is already uniform or the index is a constant.
 559           * We will typically not get here if the optimizer is doing its job,
 560           * but asserting would be mean.
 561           */
 562          const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
 563          brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
 564       } else {
 565          /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
 566          struct brw_reg addr = vec8(brw_address_reg(0));
 567
 568          struct brw_reg group_idx = suboffset(idx, group);
 569
 570          if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
 571             /* Things get grumpy if the register is too wide. */
 572             group_idx.width--;
 573             group_idx.vstride--;
 574          }
 575
 576          assert(type_sz(group_idx.type) <= 4);
 577          if (type_sz(group_idx.type) == 4) {
 578             /* The destination stride of an instruction (in bytes) must be
 579              * greater than or equal to the size of the rest of the
 580              * instruction.  Since the address register is of type UW, we
 581              * can't use a D-type instruction.  In order to get around this,
 582              * re retype to UW and use a stride.
 583              */
 584             group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
 585          }
 586
 587          /* Take into account the component size and horizontal stride. */
 588          assert(src.vstride == src.hstride + src.width);
 589          brw_SHL(p, addr, group_idx,
 590                  brw_imm_uw(util_logbase2(type_sz(src.type)) +
 591                             src.hstride - 1));
 592
 593          /* Add on the register start offset */
 594          brw_set_default_swsb(p, tgl_swsb_regdist(1));
 595          brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
 596
 597          if (type_sz(src.type) > 4 &&
 598              ((devinfo->gen == 7 && !devinfo->is_haswell) ||
 599               devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
 600             /* IVB has an issue (which we found empirically) where it reads
 601              * two address register components per channel for indirectly
 602              * addressed 64-bit sources.
 603              *
 604              * From the Cherryview PRM Vol 7. "Register Region Restrictions":
 605              *
 606              *    "When source or destination datatype is 64b or operation is
 607              *    integer DWord multiply, indirect addressing must not be
 608              *    used."
 609              *
 610              * To work around both of these, we do two integer MOVs insead of
 611              * one 64-bit MOV.  Because no double value should ever cross a
 612              * register boundary, it's safe to use the immediate offset in the
 613              * indirect here to handle adding 4 bytes to the offset and avoid
 614              * the extra ADD to the register file.
 615              */
 616             struct brw_reg gdst = suboffset(dst, group);
 617             struct brw_reg dst_d = retype(spread(gdst, 2),
 618                                           BRW_REGISTER_TYPE_D);
 619             assert(dst.hstride == 1);
 620             brw_MOV(p, dst_d,
 621                     retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
 622             brw_set_default_swsb(p, tgl_swsb_null());
 623             brw_MOV(p, byte_offset(dst_d, 4),
 624                     retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
 625          } else {
 626             brw_MOV(p, suboffset(dst, group * dst.hstride),
 627                     retype(brw_VxH_indirect(0, 0), src.type));
 628          }
 629       }
 630
 631       brw_set_default_swsb(p, tgl_swsb_null());
 632    }
 633 }
 634
 635 void
 636 fs_generator::generate_quad_swizzle(const fs_inst *inst,
 637                                     struct brw_reg dst, struct brw_reg src,
 638                                     unsigned swiz)
 639 {
 640    /* Requires a quad. */
 641    assert(inst->exec_size >= 4);
 642
 643    if (src.file == BRW_IMMEDIATE_VALUE ||
 644        has_scalar_region(src)) {
 645       /* The value is uniform across all channels */
 646       brw_MOV(p, dst, src);
 647
 648    } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
 649       /* This only works on 8-wide 32-bit values */
 650       assert(inst->exec_size == 8);
 651       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 652       assert(src.vstride == src.width + 1);
 653       brw_set_default_access_mode(p, BRW_ALIGN_16);
 654       struct brw_reg swiz_src = stride(src, 4, 4, 1);
 655       swiz_src.swizzle = swiz;
 656       brw_MOV(p, dst, swiz_src);
 657
 658    } else {
 659       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 660       assert(src.vstride == src.width + 1);
 661       const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
 662
 663       switch (swiz) {
 664       case BRW_SWIZZLE_XXXX:
 665       case BRW_SWIZZLE_YYYY:
 666       case BRW_SWIZZLE_ZZZZ:
 667       case BRW_SWIZZLE_WWWW:
 668          brw_MOV(p, dst, stride(src_0, 4, 4, 0));
 669          break;
 670
 671       case BRW_SWIZZLE_XXZZ:
 672       case BRW_SWIZZLE_YYWW:
 673          brw_MOV(p, dst, stride(src_0, 2, 2, 0));
 674          break;
 675
 676       case BRW_SWIZZLE_XYXY:
 677       case BRW_SWIZZLE_ZWZW:
 678          assert(inst->exec_size == 4);
 679          brw_MOV(p, dst, stride(src_0, 0, 2, 1));
 680          break;
 681
 682       default:
 683          assert(inst->force_writemask_all);
 684          brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
 685
 686          for (unsigned c = 0; c < 4; c++) {
 687             brw_inst *insn = brw_MOV(
 688                p, stride(suboffset(dst, c),
 689                          4 * inst->dst.stride, 1, 4 * inst->dst.stride),
 690                stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
 691
 692             if (devinfo->gen < 12) {
 693                brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
 694                brw_inst_set_no_dd_check(devinfo, insn, c > 0);
 695             }
 696
 697             brw_set_default_swsb(p, tgl_swsb_null());
 698          }
 699
 700          break;
 701       }
 702    }
 703 }
 704
 705 void
 706 fs_generator::generate_urb_read(fs_inst *inst,
 707                                 struct brw_reg dst,
 708                                 struct brw_reg header)
 709 {
 710    assert(inst->size_written % REG_SIZE == 0);
 711    assert(header.file == BRW_GENERAL_REGISTER_FILE);
 712    assert(header.type == BRW_REGISTER_TYPE_UD);
 713
 714    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 715    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
 716    brw_set_src0(p, send, header);
 717    if (devinfo->gen < 12)
 718       brw_set_src1(p, send, brw_imm_ud(0u));
 719
 720    brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
 721    brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
 722
 723    if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
 724       brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
 725
 726    brw_inst_set_mlen(p->devinfo, send, inst->mlen);
 727    brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
 728    brw_inst_set_header_present(p->devinfo, send, true);
 729    brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
 730 }
 731
 732 void
 733 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
 734 {
 735    brw_inst *insn;
 736
 737     /* WaClearTDRRegBeforeEOTForNonPS.
 738      *
 739      *   WA: Clear tdr register before send EOT in all non-PS shader kernels
 740      *
 741      *   mov(8) tdr0:ud 0x0:ud {NoMask}"
 742      */
 743    if (inst->eot && p->devinfo->gen == 10) {
 744       brw_push_insn_state(p);
 745       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 746       brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0));
 747       brw_pop_insn_state(p);
 748    }
 749
 750    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 751
 752    brw_set_dest(p, insn, brw_null_reg());
 753    brw_set_src0(p, insn, payload);
 754    if (devinfo->gen < 12)
 755       brw_set_src1(p, insn, brw_imm_ud(0u));
 756
 757    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
 758    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
 759
 760    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
 761        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
 762       brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
 763
 764    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
 765        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
 766       brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
 767
 768    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
 769    brw_inst_set_rlen(p->devinfo, insn, 0);
 770    brw_inst_set_eot(p->devinfo, insn, inst->eot);
 771    brw_inst_set_header_present(p->devinfo, insn, true);
 772    brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
 773 }
 774
 775 void
 776 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
 777 {
 778    struct brw_inst *insn;
 779
 780    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 781
 782    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
 783    brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
 784    if (devinfo->gen < 12)
 785       brw_set_src1(p, insn, brw_imm_ud(0u));
 786
 787    /* Terminate a compute shader by sending a message to the thread spawner.
 788     */
 789    brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
 790    brw_inst_set_mlen(devinfo, insn, 1);
 791    brw_inst_set_rlen(devinfo, insn, 0);
 792    brw_inst_set_eot(devinfo, insn, inst->eot);
 793    brw_inst_set_header_present(devinfo, insn, false);
 794
 795    brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
 796
 797    if (devinfo->gen < 11) {
 798       brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
 799
 800       /* Note that even though the thread has a URB resource associated with it,
 801        * we set the "do not dereference URB" bit, because the URB resource is
 802        * managed by the fixed-function unit, so it will free it automatically.
 803        */
 804       brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
 805    }
 806
 807    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
 808 }
 809
 810 void
 811 fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
 812 {
 813    brw_barrier(p, src);
 814    if (devinfo->gen >= 12) {
 815       brw_set_default_swsb(p, tgl_swsb_null());
 816       brw_SYNC(p, TGL_SYNC_BAR);
 817    } else {
 818       brw_WAIT(p);
 819    }
 820 }
 821
 822 bool
 823 fs_generator::generate_linterp(fs_inst *inst,
 824                                struct brw_reg dst, struct brw_reg *src)
 825 {
 826    /* PLN reads:
 827     *                      /   in SIMD16   \
 828     *    -----------------------------------
 829     *   | src1+0 | src1+1 | src1+2 | src1+3 |
 830     *   |-----------------------------------|
 831     *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
 832     *    -----------------------------------
 833     *
 834     * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
 835     *
 836     *    -----------------------------------
 837     *   | src1+0 | src1+1 | src1+2 | src1+3 |
 838     *   |-----------------------------------|
 839     *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
 840     *   |-----------------------------------|
 841     *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
 842     *    -----------------------------------
 843     *
 844     * See also: emit_interpolation_setup_gen4().
 845     */
 846    struct brw_reg delta_x = src[0];
 847    struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
 848    struct brw_reg interp = src[1];
 849    brw_inst *i[2];
 850
 851    /* nir_lower_interpolation() will do the lowering to MAD instructions for
 852     * us on gen11+
 853     */
 854    assert(devinfo->gen < 11);
 855
 856    if (devinfo->has_pln) {
 857       if (devinfo->gen <= 6 && (delta_x.nr & 1) != 0) {
 858          /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
 859           *
 860           *    "[DevSNB]:<src1> must be even register aligned.
 861           *
 862           * This restriction is lifted on Ivy Bridge.
 863           *
 864           * This means that we need to split PLN into LINE+MAC on-the-fly.
 865           * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
 866           * we have to split into SIMD8 pieces.  For gen4 (!has_pln), the
 867           * coordinate registers are laid out differently so we leave it as a
 868           * SIMD16 instruction.
 869           */
 870          assert(inst->exec_size == 8 || inst->exec_size == 16);
 871          assert(inst->group % 16 == 0);
 872
 873          brw_push_insn_state(p);
 874          brw_set_default_exec_size(p, BRW_EXECUTE_8);
 875
 876          /* Thanks to two accumulators, we can emit all the LINEs and then all
 877           * the MACs.  This improves parallelism a bit.
 878           */
 879          for (unsigned g = 0; g < inst->exec_size / 8; g++) {
 880             brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
 881                                       offset(delta_x, g * 2));
 882             brw_inst_set_group(devinfo, line, inst->group + g * 8);
 883
 884             /* LINE writes the accumulator automatically on gen4-5.  On Sandy
 885              * Bridge and later, we have to explicitly enable it.
 886              */
 887             if (devinfo->gen >= 6)
 888                brw_inst_set_acc_wr_control(p->devinfo, line, true);
 889
 890             /* brw_set_default_saturate() is called before emitting
 891              * instructions, so the saturate bit is set in each instruction,
 892              * so we need to unset it on the LINE instructions.
 893              */
 894             brw_inst_set_saturate(p->devinfo, line, false);
 895          }
 896
 897          for (unsigned g = 0; g < inst->exec_size / 8; g++) {
 898             brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
 899                                     offset(delta_x, g * 2 + 1));
 900             brw_inst_set_group(devinfo, mac, inst->group + g * 8);
 901             brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
 902          }
 903
 904          brw_pop_insn_state(p);
 905
 906          return true;
 907       } else {
 908          brw_PLN(p, dst, interp, delta_x);
 909
 910          return false;
 911       }
 912    } else {
 913       i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
 914       i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 915
 916       brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
 917
 918       /* brw_set_default_saturate() is called before emitting instructions, so
 919        * the saturate bit is set in each instruction, so we need to unset it on
 920        * the first instruction.
 921        */
 922       brw_inst_set_saturate(p->devinfo, i[0], false);
 923
 924       return true;
 925    }
 926 }
 927
 928 void
 929 fs_generator::generate_get_buffer_size(fs_inst *inst,
 930                                        struct brw_reg dst,
 931                                        struct brw_reg src,
 932                                        struct brw_reg surf_index)
 933 {
 934    assert(devinfo->gen >= 7);
 935    assert(surf_index.file == BRW_IMMEDIATE_VALUE);
 936
 937    uint32_t simd_mode;
 938    int rlen = 4;
 939
 940    switch (inst->exec_size) {
 941    case 8:
 942       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 943       break;
 944    case 16:
 945       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 946       break;
 947    default:
 948       unreachable("Invalid width for texture instruction");
 949    }
 950
 951    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 952       rlen = 8;
 953       dst = vec16(dst);
 954    }
 955
 956    brw_SAMPLE(p,
 957               retype(dst, BRW_REGISTER_TYPE_UW),
 958               inst->base_mrf,
 959               src,
 960               surf_index.ud,
 961               0,
 962               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
 963               rlen, /* response length */
 964               inst->mlen,
 965               inst->header_size > 0,
 966               simd_mode,
 967               BRW_SAMPLER_RETURN_FORMAT_SINT32);
 968 }
 969
 970 void
 971 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
 972                            struct brw_reg surface_index,
 973                            struct brw_reg sampler_index)
 974 {
 975    assert(devinfo->gen < 7);
 976    assert(inst->size_written % REG_SIZE == 0);
 977    int msg_type = -1;
 978    uint32_t simd_mode;
 979    uint32_t return_format;
 980
 981    /* Sampler EOT message of less than the dispatch width would kill the
 982     * thread prematurely.
 983     */
 984    assert(!inst->eot || inst->exec_size == dispatch_width);
 985
 986    switch (dst.type) {
 987    case BRW_REGISTER_TYPE_D:
 988       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
 989       break;
 990    case BRW_REGISTER_TYPE_UD:
 991       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 992       break;
 993    default:
 994       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 995       break;
 996    }
 997
 998    /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
 999     * is set as part of the message descriptor.  On gen4, the PRM seems to
1000     * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
1001     * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
1002     * gone from the message descriptor entirely and you just get UINT32 all
1003     * the time regasrdless.  Since we can really only do non-UINT32 on gen4,
1004     * just stomp it to UINT32 all the time.
1005     */
1006    if (inst->opcode == SHADER_OPCODE_TXS)
1007       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
1008
1009    switch (inst->exec_size) {
1010    case 8:
1011       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1012       break;
1013    case 16:
1014       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1015       break;
1016    default:
1017       unreachable("Invalid width for texture instruction");
1018    }
1019
1020    if (devinfo->gen >= 5) {
1021       switch (inst->opcode) {
1022       case SHADER_OPCODE_TEX:
1023          if (inst->shadow_compare) {
1024             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1025          } else {
1026             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1027          }
1028          break;
1029       case FS_OPCODE_TXB:
1030          if (inst->shadow_compare) {
1031             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
1032          } else {
1033             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1034          }
1035          break;
1036       case SHADER_OPCODE_TXL:
1037          if (inst->shadow_compare) {
1038             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
1039          } else {
1040             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
1041          }
1042          break;
1043       case SHADER_OPCODE_TXS:
1044          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
1045          break;
1046       case SHADER_OPCODE_TXD:
1047          assert(!inst->shadow_compare);
1048          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
1049          break;
1050       case SHADER_OPCODE_TXF:
1051          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1052          break;
1053       case SHADER_OPCODE_TXF_CMS:
1054          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1055          break;
1056       case SHADER_OPCODE_LOD:
1057          msg_type = GEN5_SAMPLER_MESSAGE_LOD;
1058          break;
1059       case SHADER_OPCODE_TG4:
1060          assert(devinfo->gen == 6);
1061          assert(!inst->shadow_compare);
1062          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
1063          break;
1064       case SHADER_OPCODE_SAMPLEINFO:
1065          msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
1066          break;
1067       default:
1068          unreachable("not reached");
1069       }
1070    } else {
1071       switch (inst->opcode) {
1072       case SHADER_OPCODE_TEX:
1073          /* Note that G45 and older determines shadow compare and dispatch width
1074           * from message length for most messages.
1075           */
1076          if (inst->exec_size == 8) {
1077             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1078             if (inst->shadow_compare) {
1079                assert(inst->mlen == 6);
1080             } else {
1081                assert(inst->mlen <= 4);
1082             }
1083          } else {
1084             if (inst->shadow_compare) {
1085                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1086                assert(inst->mlen == 9);
1087             } else {
1088                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1089                assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
1090             }
1091          }
1092          break;
1093       case FS_OPCODE_TXB:
1094          if (inst->shadow_compare) {
1095             assert(inst->exec_size == 8);
1096             assert(inst->mlen == 6);
1097             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
1098          } else {
1099             assert(inst->mlen == 9);
1100             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1101             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1102          }
1103          break;
1104       case SHADER_OPCODE_TXL:
1105          if (inst->shadow_compare) {
1106             assert(inst->exec_size == 8);
1107             assert(inst->mlen == 6);
1108             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
1109          } else {
1110             assert(inst->mlen == 9);
1111             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
1112             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1113          }
1114          break;
1115       case SHADER_OPCODE_TXD:
1116          /* There is no sample_d_c message; comparisons are done manually */
1117          assert(inst->exec_size == 8);
1118          assert(inst->mlen == 7 || inst->mlen == 10);
1119          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
1120          break;
1121       case SHADER_OPCODE_TXF:
1122          assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
1123          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1124          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1125          break;
1126       case SHADER_OPCODE_TXS:
1127          assert(inst->mlen == 3);
1128          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
1129          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1130          break;
1131       default:
1132          unreachable("not reached");
1133       }
1134    }
1135    assert(msg_type != -1);
1136
1137    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
1138       dst = vec16(dst);
1139    }
1140
1141    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
1142
1143    /* Load the message header if present.  If there's a texture offset,
1144     * we need to set it up explicitly and load the offset bitfield.
1145     * Otherwise, we can use an implied move from g0 to the first message reg.
1146     */
1147    struct brw_reg src = brw_null_reg();
1148    if (inst->header_size != 0) {
1149       if (devinfo->gen < 6 && !inst->offset) {
1150          /* Set up an implied move from g0 to the MRF. */
1151          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
1152       } else {
1153          const tgl_swsb swsb = brw_get_default_swsb(p);
1154          assert(inst->base_mrf != -1);
1155          struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
1156
1157          brw_push_insn_state(p);
1158          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1159          brw_set_default_exec_size(p, BRW_EXECUTE_8);
1160          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1161          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1162          /* Explicitly set up the message header by copying g0 to the MRF. */
1163          brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
1164          brw_set_default_swsb(p, tgl_swsb_regdist(1));
1165
1166          brw_set_default_exec_size(p, BRW_EXECUTE_1);
1167          if (inst->offset) {
1168             /* Set the offset bits in DWord 2. */
1169             brw_MOV(p, get_element_ud(header_reg, 2),
1170                        brw_imm_ud(inst->offset));
1171          }
1172
1173          brw_pop_insn_state(p);
1174          brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1175       }
1176    }
1177
1178    uint32_t base_binding_table_index;
1179    switch (inst->opcode) {
1180    case SHADER_OPCODE_TG4:
1181       base_binding_table_index = prog_data->binding_table.gather_texture_start;
1182       break;
1183    default:
1184       base_binding_table_index = prog_data->binding_table.texture_start;
1185       break;
1186    }
1187
1188    assert(surface_index.file == BRW_IMMEDIATE_VALUE);
1189    assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
1190
1191    brw_SAMPLE(p,
1192               retype(dst, BRW_REGISTER_TYPE_UW),
1193               inst->base_mrf,
1194               src,
1195               surface_index.ud + base_binding_table_index,
1196               sampler_index.ud % 16,
1197               msg_type,
1198               inst->size_written / REG_SIZE,
1199               inst->mlen,
1200               inst->header_size != 0,
1201               simd_mode,
1202               return_format);
1203 }
1204
1205
1206 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1207  * looking like:
1208  *
1209  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1210  *
1211  * Ideally, we want to produce:
1212  *
1213  *           DDX                     DDY
1214  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1215  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1216  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1217  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1218  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1219  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1220  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1221  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1222  *
1223  * and add another set of two more subspans if in 16-pixel dispatch mode.
1224  *
1225  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1226  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1227  * pair.  But the ideal approximation may impose a huge performance cost on
1228  * sample_d.  On at least Haswell, sample_d instruction does some
1229  * optimizations if the same LOD is used for all pixels in the subspan.
1230  *
1231  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1232  * appropriate swizzling.
1233  */
1234 void
1235 fs_generator::generate_ddx(const fs_inst *inst,
1236                            struct brw_reg dst, struct brw_reg src)
1237 {
1238    unsigned vstride, width;
1239
1240    if (devinfo->gen >= 8) {
1241       if (inst->opcode == FS_OPCODE_DDX_FINE) {
1242          /* produce accurate derivatives */
1243          vstride = BRW_VERTICAL_STRIDE_2;
1244          width = BRW_WIDTH_2;
1245       } else {
1246          /* replicate the derivative at the top-left pixel to other pixels */
1247          vstride = BRW_VERTICAL_STRIDE_4;
1248          width = BRW_WIDTH_4;
1249       }
1250
1251       struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
1252       struct brw_reg src1 = src;
1253
1254       src0.vstride = vstride;
1255       src0.width   = width;
1256       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1257       src1.vstride = vstride;
1258       src1.width   = width;
1259       src1.hstride = BRW_HORIZONTAL_STRIDE_0;
1260
1261       brw_ADD(p, dst, src0, negate(src1));
1262    } else {
1263       /* On Haswell and earlier, the region used above appears to not work
1264        * correctly for compressed instructions.  At least on Haswell and
1265        * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1266        * would have to split to SIMD8 no matter which method we choose, we
1267        * may as well use ALIGN16 on all platforms gen7 and earlier.
1268        */
1269       struct brw_reg src0 = stride(src, 4, 4, 1);
1270       struct brw_reg src1 = stride(src, 4, 4, 1);
1271       if (inst->opcode == FS_OPCODE_DDX_FINE) {
1272          src0.swizzle = BRW_SWIZZLE_XXZZ;
1273          src1.swizzle = BRW_SWIZZLE_YYWW;
1274       } else {
1275          src0.swizzle = BRW_SWIZZLE_XXXX;
1276          src1.swizzle = BRW_SWIZZLE_YYYY;
1277       }
1278
1279       brw_push_insn_state(p);
1280       brw_set_default_access_mode(p, BRW_ALIGN_16);
1281       brw_ADD(p, dst, negate(src0), src1);
1282       brw_pop_insn_state(p);
1283    }
1284 }
1285
1286 /* The negate_value boolean is used to negate the derivative computation for
1287  * FBOs, since they place the origin at the upper left instead of the lower
1288  * left.
1289  */
1290 void
1291 fs_generator::generate_ddy(const fs_inst *inst,
1292                            struct brw_reg dst, struct brw_reg src)
1293 {
1294    const uint32_t type_size = type_sz(src.type);
1295
1296    if (inst->opcode == FS_OPCODE_DDY_FINE) {
1297       /* produce accurate derivatives.
1298        *
1299        * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
1300        * "Register Region Restrictions", Section "1. Special Restrictions":
1301        *
1302        *    "In Align16 mode, the channel selects and channel enables apply to
1303        *     a pair of half-floats, because these parameters are defined for
1304        *     DWord elements ONLY. This is applicable when both source and
1305        *     destination are half-floats."
1306        *
1307        * So for half-float operations we use the Gen11+ Align1 path. CHV
1308        * inherits its FP16 hardware from SKL, so it is not affected.
1309        */
1310       if (devinfo->gen >= 11 ||
1311           (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) {
1312          src = stride(src, 0, 2, 1);
1313
1314          brw_push_insn_state(p);
1315          brw_set_default_exec_size(p, BRW_EXECUTE_4);
1316          for (uint32_t g = 0; g < inst->exec_size; g += 4) {
1317             brw_set_default_group(p, inst->group + g);
1318             brw_ADD(p, byte_offset(dst, g * type_size),
1319                        negate(byte_offset(src,  g * type_size)),
1320                        byte_offset(src, (g + 2) * type_size));
1321             brw_set_default_swsb(p, tgl_swsb_null());
1322          }
1323          brw_pop_insn_state(p);
1324       } else {
1325          struct brw_reg src0 = stride(src, 4, 4, 1);
1326          struct brw_reg src1 = stride(src, 4, 4, 1);
1327          src0.swizzle = BRW_SWIZZLE_XYXY;
1328          src1.swizzle = BRW_SWIZZLE_ZWZW;
1329
1330          brw_push_insn_state(p);
1331          brw_set_default_access_mode(p, BRW_ALIGN_16);
1332          brw_ADD(p, dst, negate(src0), src1);
1333          brw_pop_insn_state(p);
1334       }
1335    } else {
1336       /* replicate the derivative at the top-left pixel to other pixels */
1337       if (devinfo->gen >= 8) {
1338          struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
1339          struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
1340
1341          brw_ADD(p, dst, negate(src0), src1);
1342       } else {
1343          /* On Haswell and earlier, the region used above appears to not work
1344           * correctly for compressed instructions.  At least on Haswell and
1345           * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1346           * would have to split to SIMD8 no matter which method we choose, we
1347           * may as well use ALIGN16 on all platforms gen7 and earlier.
1348           */
1349          struct brw_reg src0 = stride(src, 4, 4, 1);
1350          struct brw_reg src1 = stride(src, 4, 4, 1);
1351          src0.swizzle = BRW_SWIZZLE_XXXX;
1352          src1.swizzle = BRW_SWIZZLE_ZZZZ;
1353
1354          brw_push_insn_state(p);
1355          brw_set_default_access_mode(p, BRW_ALIGN_16);
1356          brw_ADD(p, dst, negate(src0), src1);
1357          brw_pop_insn_state(p);
1358       }
1359    }
1360 }
1361
1362 void
1363 fs_generator::generate_discard_jump(fs_inst *)
1364 {
1365    assert(devinfo->gen >= 6);
1366
1367    /* This HALT will be patched up at FB write time to point UIP at the end of
1368     * the program, and at brw_uip_jip() JIP will be set to the end of the
1369     * current block (or the program).
1370     */
1371    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1372    gen6_HALT(p);
1373 }
1374
1375 void
1376 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1377 {
1378    /* The 32-wide messages only respect the first 16-wide half of the channel
1379     * enable signals which are replicated identically for the second group of
1380     * 16 channels, so we cannot use them unless the write is marked
1381     * force_writemask_all.
1382     */
1383    const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1384                                MIN2(16, inst->exec_size);
1385    const unsigned block_size = 4 * lower_size / REG_SIZE;
1386    const tgl_swsb swsb = brw_get_default_swsb(p);
1387    assert(inst->mlen != 0);
1388
1389    brw_push_insn_state(p);
1390    brw_set_default_exec_size(p, cvt(lower_size) - 1);
1391    brw_set_default_compression(p, lower_size > 8);
1392
1393    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1394       brw_set_default_group(p, inst->group + lower_size * i);
1395
1396       if (i > 0) {
1397          assert(swsb.mode & TGL_SBID_SET);
1398          brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
1399       } else {
1400          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1401       }
1402
1403       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1404               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1405
1406       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1407       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1408                                     block_size,
1409                                     inst->offset + block_size * REG_SIZE * i);
1410    }
1411
1412    brw_pop_insn_state(p);
1413 }
1414
1415 void
1416 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1417 {
1418    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1419    assert(inst->mlen != 0);
1420
1421    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1422                                 inst->exec_size / 8, inst->offset);
1423 }
1424
1425 void
1426 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1427 {
1428    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1429
1430    gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1431 }
1432
1433 void
1434 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1435                                                   struct brw_reg dst,
1436                                                   struct brw_reg index,
1437                                                   struct brw_reg offset)
1438 {
1439    assert(type_sz(dst.type) == 4);
1440    assert(inst->mlen != 0);
1441
1442    assert(index.file == BRW_IMMEDIATE_VALUE &&
1443           index.type == BRW_REGISTER_TYPE_UD);
1444    uint32_t surf_index = index.ud;
1445
1446    assert(offset.file == BRW_IMMEDIATE_VALUE &&
1447           offset.type == BRW_REGISTER_TYPE_UD);
1448    uint32_t read_offset = offset.ud;
1449
1450    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1451                         read_offset, surf_index);
1452 }
1453
1454 void
1455 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1456                                                        struct brw_reg dst,
1457                                                        struct brw_reg index,
1458                                                        struct brw_reg payload)
1459 {
1460    assert(index.type == BRW_REGISTER_TYPE_UD);
1461    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1462    assert(type_sz(dst.type) == 4);
1463
1464    if (index.file == BRW_IMMEDIATE_VALUE) {
1465       const uint32_t surf_index = index.ud;
1466
1467       brw_push_insn_state(p);
1468       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1469       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1470       brw_pop_insn_state(p);
1471
1472       brw_inst_set_sfid(devinfo, send, GEN6_SFID_DATAPORT_CONSTANT_CACHE);
1473       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1474       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1475       brw_set_desc(p, send,
1476                    brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
1477                                                              REG_SIZE), true) |
1478                    brw_dp_read_desc(devinfo, surf_index,
1479                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1480                                     GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1481                                     BRW_DATAPORT_READ_TARGET_DATA_CACHE));
1482
1483    } else {
1484       const tgl_swsb swsb = brw_get_default_swsb(p);
1485       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1486
1487       brw_push_insn_state(p);
1488       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1489
1490       /* a0.0 = surf_index & 0xff */
1491       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1492       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1493       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1494       brw_set_dest(p, insn_and, addr);
1495       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1496       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1497
1498       /* dst = send(payload, a0.0 | <descriptor>) */
1499       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1500       brw_send_indirect_message(
1501          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1502          retype(dst, BRW_REGISTER_TYPE_UD),
1503          retype(payload, BRW_REGISTER_TYPE_UD), addr,
1504          brw_message_desc(devinfo, 1,
1505                           DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
1506          brw_dp_read_desc(devinfo, 0 /* surface */,
1507                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1508                           GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1509                           BRW_DATAPORT_READ_TARGET_DATA_CACHE),
1510          false /* EOT */);
1511
1512       brw_pop_insn_state(p);
1513    }
1514 }
1515
1516 void
1517 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1518                                                        struct brw_reg dst,
1519                                                        struct brw_reg index)
1520 {
1521    assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1522    assert(inst->header_size != 0);
1523    assert(inst->mlen);
1524
1525    assert(index.file == BRW_IMMEDIATE_VALUE &&
1526           index.type == BRW_REGISTER_TYPE_UD);
1527    uint32_t surf_index = index.ud;
1528
1529    uint32_t simd_mode, rlen, msg_type;
1530    if (inst->exec_size == 16) {
1531       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1532       rlen = 8;
1533    } else {
1534       assert(inst->exec_size == 8);
1535       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1536       rlen = 4;
1537    }
1538
1539    if (devinfo->gen >= 5)
1540       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1541    else {
1542       /* We always use the SIMD16 message so that we only have to load U, and
1543        * not V or R.
1544        */
1545       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1546       assert(inst->mlen == 3);
1547       assert(inst->size_written == 8 * REG_SIZE);
1548       rlen = 8;
1549       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1550    }
1551
1552    struct brw_reg header = brw_vec8_grf(0, 0);
1553    gen6_resolve_implied_move(p, &header, inst->base_mrf);
1554
1555    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1556    brw_inst_set_compression(devinfo, send, false);
1557    brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
1558    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1559    brw_set_src0(p, send, header);
1560    if (devinfo->gen < 6)
1561       brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1562
1563    /* Our surface is set up as floats, regardless of what actual data is
1564     * stored in it.
1565     */
1566    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1567    brw_set_desc(p, send,
1568                 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
1569                 brw_sampler_desc(devinfo, surf_index,
1570                                  0, /* sampler (unused) */
1571                                  msg_type, simd_mode, return_format));
1572 }
1573
1574 void
1575 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1576                                                 struct brw_reg dst,
1577                                                 struct brw_reg src,
1578                                                 struct brw_reg msg_data,
1579                                                 unsigned msg_type)
1580 {
1581    const bool has_payload = inst->src[0].file != BAD_FILE;
1582    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1583    assert(inst->size_written % REG_SIZE == 0);
1584
1585    brw_pixel_interpolator_query(p,
1586          retype(dst, BRW_REGISTER_TYPE_UW),
1587          /* If we don't have a payload, what we send doesn't matter */
1588          has_payload ? src : brw_vec8_grf(0, 0),
1589          inst->pi_noperspective,
1590          msg_type,
1591          msg_data,
1592          has_payload ? 2 * inst->exec_size / 8 : 1,
1593          inst->size_written / REG_SIZE);
1594 }
1595
1596 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1597  * the ADD instruction.
1598  */
1599 void
1600 fs_generator::generate_set_sample_id(fs_inst *inst,
1601                                      struct brw_reg dst,
1602                                      struct brw_reg src0,
1603                                      struct brw_reg src1)
1604 {
1605    assert(dst.type == BRW_REGISTER_TYPE_D ||
1606           dst.type == BRW_REGISTER_TYPE_UD);
1607    assert(src0.type == BRW_REGISTER_TYPE_D ||
1608           src0.type == BRW_REGISTER_TYPE_UD);
1609
1610    const struct brw_reg reg = stride(src1, 1, 4, 0);
1611    const unsigned lower_size = MIN2(inst->exec_size,
1612                                     devinfo->gen >= 8 ? 16 : 8);
1613
1614    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1615       brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
1616                                offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
1617                                              (i * lower_size / (1 << src0.width))) *
1618                                             type_sz(src0.type) / REG_SIZE),
1619                                suboffset(reg, i * lower_size / 4));
1620       brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
1621       brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
1622       brw_inst_set_compression(devinfo, insn, lower_size > 8);
1623       brw_set_default_swsb(p, tgl_swsb_null());
1624    }
1625 }
1626
1627 void
1628 fs_generator::generate_pack_half_2x16_split(fs_inst *,
1629                                             struct brw_reg dst,
1630                                             struct brw_reg x,
1631                                             struct brw_reg y)
1632 {
1633    assert(devinfo->gen >= 7);
1634    assert(dst.type == BRW_REGISTER_TYPE_UD);
1635    assert(x.type == BRW_REGISTER_TYPE_F);
1636    assert(y.type == BRW_REGISTER_TYPE_F);
1637
1638    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1639     *
1640     *   Because this instruction does not have a 16-bit floating-point type,
1641     *   the destination data type must be Word (W).
1642     *
1643     *   The destination must be DWord-aligned and specify a horizontal stride
1644     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1645     *   each destination channel and the upper word is not modified.
1646     */
1647    struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1648
1649    /* Give each 32-bit channel of dst the form below, where "." means
1650     * unchanged.
1651     *   0x....hhhh
1652     */
1653    brw_F32TO16(p, dst_w, y);
1654
1655    /* Now the form:
1656     *   0xhhhh0000
1657     */
1658    brw_set_default_swsb(p, tgl_swsb_regdist(1));
1659    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1660
1661    /* And, finally the form of packHalf2x16's output:
1662     *   0xhhhhllll
1663     */
1664    brw_F32TO16(p, dst_w, x);
1665 }
1666
1667 void
1668 fs_generator::generate_shader_time_add(fs_inst *,
1669                                        struct brw_reg payload,
1670                                        struct brw_reg offset,
1671                                        struct brw_reg value)
1672 {
1673    const tgl_swsb swsb = brw_get_default_swsb(p);
1674
1675    assert(devinfo->gen >= 7);
1676    brw_push_insn_state(p);
1677    brw_set_default_mask_control(p, true);
1678    brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1679
1680    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1681    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1682                                           offset.type);
1683    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1684                                          value.type);
1685
1686    assert(offset.file == BRW_IMMEDIATE_VALUE);
1687    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1688       value.width = BRW_WIDTH_1;
1689       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1690       value.vstride = BRW_VERTICAL_STRIDE_0;
1691    } else {
1692       assert(value.file == BRW_IMMEDIATE_VALUE);
1693    }
1694
1695    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1696     * case, and we don't really care about squeezing every bit of performance
1697     * out of this path, so we just emit the MOVs from here.
1698     */
1699    brw_MOV(p, payload_offset, offset);
1700    brw_set_default_swsb(p, tgl_swsb_null());
1701    brw_MOV(p, payload_value, value);
1702    brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1703    brw_shader_time_add(p, payload,
1704                        prog_data->binding_table.shader_time_start);
1705    brw_pop_insn_state(p);
1706 }
1707
1708 void
1709 fs_generator::enable_debug(const char *shader_name)
1710 {
1711    debug_flag = true;
1712    this->shader_name = shader_name;
1713 }
1714
1715 int
1716 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
1717                             struct shader_stats shader_stats,
1718                             const brw::performance &perf,
1719                             struct brw_compile_stats *stats)
1720 {
1721    /* align to 64 byte boundary. */
1722    while (p->next_insn_offset % 64)
1723       brw_NOP(p);
1724
1725    this->dispatch_width = dispatch_width;
1726
1727    int start_offset = p->next_insn_offset;
1728
1729    /* `send_count` explicitly does not include spills or fills, as we'd
1730     * like to use it as a metric for intentional memory access or other
1731     * shared function use.  Otherwise, subtle changes to scheduling or
1732     * register allocation could cause it to fluctuate wildly - and that
1733     * effect is already counted in spill/fill counts.
1734     */
1735    int spill_count = 0, fill_count = 0;
1736    int loop_count = 0, send_count = 0, nop_count = 0;
1737    bool is_accum_used = false;
1738
1739    struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
1740
1741    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1742       if (inst->opcode == SHADER_OPCODE_UNDEF)
1743          continue;
1744
1745       struct brw_reg src[4], dst;
1746       unsigned int last_insn_offset = p->next_insn_offset;
1747       bool multiple_instructions_emitted = false;
1748
1749       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1750        * "Register Region Restrictions" section: for BDW, SKL:
1751        *
1752        *    "A POW/FDIV operation must not be followed by an instruction
1753        *     that requires two destination registers."
1754        *
1755        * The documentation is often lacking annotations for Atom parts,
1756        * and empirically this affects CHV as well.
1757        */
1758       if (devinfo->gen >= 8 &&
1759           devinfo->gen <= 9 &&
1760           p->nr_insn > 1 &&
1761           brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
1762           brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1763           inst->dst.component_size(inst->exec_size) > REG_SIZE) {
1764          brw_NOP(p);
1765          last_insn_offset = p->next_insn_offset;
1766
1767          /* In order to avoid spurious instruction count differences when the
1768           * instruction schedule changes, keep track of the number of inserted
1769           * NOPs.
1770           */
1771          nop_count++;
1772       }
1773
1774       /* GEN:BUG:14010017096:
1775        *
1776        * Clear accumulator register before end of thread.
1777        */
1778       if (inst->eot && is_accum_used && devinfo->gen >= 12) {
1779          brw_set_default_exec_size(p, BRW_EXECUTE_16);
1780          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1781          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1782          brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
1783          last_insn_offset = p->next_insn_offset;
1784       }
1785
1786       if (!is_accum_used && !inst->eot) {
1787          is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
1788                          inst->dst.is_accumulator();
1789       }
1790
1791       if (unlikely(debug_flag))
1792          disasm_annotate(disasm_info, inst, p->next_insn_offset);
1793
1794       /* If the instruction writes to more than one register, it needs to be
1795        * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
1796        * hardware figures out by itself what the right compression mode is,
1797        * but we still need to know whether the instruction is compressed to
1798        * set up the source register regions appropriately.
1799        *
1800        * XXX - This is wrong for instructions that write a single register but
1801        *       read more than one which should strictly speaking be treated as
1802        *       compressed.  For instructions that don't write any registers it
1803        *       relies on the destination being a null register of the correct
1804        *       type and regioning so the instruction is considered compressed
1805        *       or not accordingly.
1806        */
1807       const bool compressed =
1808            inst->dst.component_size(inst->exec_size) > REG_SIZE;
1809       brw_set_default_compression(p, compressed);
1810       brw_set_default_group(p, inst->group);
1811
1812       for (unsigned int i = 0; i < inst->sources; i++) {
1813          src[i] = brw_reg_from_fs_reg(devinfo, inst,
1814                                       &inst->src[i], compressed);
1815          /* The accumulator result appears to get used for the
1816           * conditional modifier generation.  When negating a UD
1817           * value, there is a 33rd bit generated for the sign in the
1818           * accumulator value, so now you can't check, for example,
1819           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1820           */
1821          assert(!inst->conditional_mod ||
1822                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1823                 !inst->src[i].negate);
1824       }
1825       dst = brw_reg_from_fs_reg(devinfo, inst,
1826                                 &inst->dst, compressed);
1827
1828       brw_set_default_access_mode(p, BRW_ALIGN_1);
1829       brw_set_default_predicate_control(p, inst->predicate);
1830       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1831       /* On gen7 and above, hardware automatically adds the group onto the
1832        * flag subregister number.  On Sandy Bridge and older, we have to do it
1833        * ourselves.
1834        */
1835       const unsigned flag_subreg = inst->flag_subreg +
1836          (devinfo->gen >= 7 ? 0 : inst->group / 16);
1837       brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
1838       brw_set_default_saturate(p, inst->saturate);
1839       brw_set_default_mask_control(p, inst->force_writemask_all);
1840       brw_set_default_acc_write_control(p, inst->writes_accumulator);
1841       brw_set_default_swsb(p, inst->sched);
1842
1843       unsigned exec_size = inst->exec_size;
1844       if (devinfo->gen == 7 && !devinfo->is_haswell &&
1845           (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
1846          exec_size *= 2;
1847       }
1848
1849       brw_set_default_exec_size(p, cvt(exec_size) - 1);
1850
1851       assert(inst->force_writemask_all || inst->exec_size >= 4);
1852       assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
1853       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1854       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1855
1856       switch (inst->opcode) {
1857       case BRW_OPCODE_SYNC:
1858          assert(src[0].file == BRW_IMMEDIATE_VALUE);
1859          brw_SYNC(p, tgl_sync_function(src[0].ud));
1860          break;
1861       case BRW_OPCODE_MOV:
1862          brw_MOV(p, dst, src[0]);
1863          break;
1864       case BRW_OPCODE_ADD:
1865          brw_ADD(p, dst, src[0], src[1]);
1866          break;
1867       case BRW_OPCODE_MUL:
1868          brw_MUL(p, dst, src[0], src[1]);
1869          break;
1870       case BRW_OPCODE_AVG:
1871          brw_AVG(p, dst, src[0], src[1]);
1872          break;
1873       case BRW_OPCODE_MACH:
1874          brw_MACH(p, dst, src[0], src[1]);
1875          break;
1876
1877       case BRW_OPCODE_LINE:
1878          brw_LINE(p, dst, src[0], src[1]);
1879          break;
1880
1881       case BRW_OPCODE_MAD:
1882          assert(devinfo->gen >= 6);
1883          if (devinfo->gen < 10)
1884             brw_set_default_access_mode(p, BRW_ALIGN_16);
1885          brw_MAD(p, dst, src[0], src[1], src[2]);
1886          break;
1887
1888       case BRW_OPCODE_LRP:
1889          assert(devinfo->gen >= 6 && devinfo->gen <= 10);
1890          if (devinfo->gen < 10)
1891             brw_set_default_access_mode(p, BRW_ALIGN_16);
1892          brw_LRP(p, dst, src[0], src[1], src[2]);
1893          break;
1894
1895       case BRW_OPCODE_FRC:
1896          brw_FRC(p, dst, src[0]);
1897          break;
1898       case BRW_OPCODE_RNDD:
1899          brw_RNDD(p, dst, src[0]);
1900          break;
1901       case BRW_OPCODE_RNDE:
1902          brw_RNDE(p, dst, src[0]);
1903          break;
1904       case BRW_OPCODE_RNDZ:
1905          brw_RNDZ(p, dst, src[0]);
1906          break;
1907
1908       case BRW_OPCODE_AND:
1909          brw_AND(p, dst, src[0], src[1]);
1910          break;
1911       case BRW_OPCODE_OR:
1912          brw_OR(p, dst, src[0], src[1]);
1913          break;
1914       case BRW_OPCODE_XOR:
1915          brw_XOR(p, dst, src[0], src[1]);
1916          break;
1917       case BRW_OPCODE_NOT:
1918          brw_NOT(p, dst, src[0]);
1919          break;
1920       case BRW_OPCODE_ASR:
1921          brw_ASR(p, dst, src[0], src[1]);
1922          break;
1923       case BRW_OPCODE_SHR:
1924          brw_SHR(p, dst, src[0], src[1]);
1925          break;
1926       case BRW_OPCODE_SHL:
1927          brw_SHL(p, dst, src[0], src[1]);
1928          break;
1929       case BRW_OPCODE_ROL:
1930          assert(devinfo->gen >= 11);
1931          assert(src[0].type == dst.type);
1932          brw_ROL(p, dst, src[0], src[1]);
1933          break;
1934       case BRW_OPCODE_ROR:
1935          assert(devinfo->gen >= 11);
1936          assert(src[0].type == dst.type);
1937          brw_ROR(p, dst, src[0], src[1]);
1938          break;
1939       case BRW_OPCODE_F32TO16:
1940          assert(devinfo->gen >= 7);
1941          brw_F32TO16(p, dst, src[0]);
1942          break;
1943       case BRW_OPCODE_F16TO32:
1944          assert(devinfo->gen >= 7);
1945          brw_F16TO32(p, dst, src[0]);
1946          break;
1947       case BRW_OPCODE_CMP:
1948          if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
1949              dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1950             /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
1951              * implemented in the compiler is not sufficient. Overriding the
1952              * type when the destination is the null register is necessary but
1953              * not sufficient by itself.
1954              */
1955             assert(dst.nr == BRW_ARF_NULL);
1956             dst.type = BRW_REGISTER_TYPE_D;
1957          }
1958          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1959          break;
1960       case BRW_OPCODE_SEL:
1961          brw_SEL(p, dst, src[0], src[1]);
1962          break;
1963       case BRW_OPCODE_CSEL:
1964          assert(devinfo->gen >= 8);
1965          if (devinfo->gen < 10)
1966             brw_set_default_access_mode(p, BRW_ALIGN_16);
1967          brw_CSEL(p, dst, src[0], src[1], src[2]);
1968          break;
1969       case BRW_OPCODE_BFREV:
1970          assert(devinfo->gen >= 7);
1971          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1972                    retype(src[0], BRW_REGISTER_TYPE_UD));
1973          break;
1974       case BRW_OPCODE_FBH:
1975          assert(devinfo->gen >= 7);
1976          brw_FBH(p, retype(dst, src[0].type), src[0]);
1977          break;
1978       case BRW_OPCODE_FBL:
1979          assert(devinfo->gen >= 7);
1980          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
1981                  retype(src[0], BRW_REGISTER_TYPE_UD));
1982          break;
1983       case BRW_OPCODE_LZD:
1984          brw_LZD(p, dst, src[0]);
1985          break;
1986       case BRW_OPCODE_CBIT:
1987          assert(devinfo->gen >= 7);
1988          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
1989                   retype(src[0], BRW_REGISTER_TYPE_UD));
1990          break;
1991       case BRW_OPCODE_ADDC:
1992          assert(devinfo->gen >= 7);
1993          brw_ADDC(p, dst, src[0], src[1]);
1994          break;
1995       case BRW_OPCODE_SUBB:
1996          assert(devinfo->gen >= 7);
1997          brw_SUBB(p, dst, src[0], src[1]);
1998          break;
1999       case BRW_OPCODE_MAC:
2000          brw_MAC(p, dst, src[0], src[1]);
2001          break;
2002
2003       case BRW_OPCODE_BFE:
2004          assert(devinfo->gen >= 7);
2005          if (devinfo->gen < 10)
2006             brw_set_default_access_mode(p, BRW_ALIGN_16);
2007          brw_BFE(p, dst, src[0], src[1], src[2]);
2008          break;
2009
2010       case BRW_OPCODE_BFI1:
2011          assert(devinfo->gen >= 7);
2012          brw_BFI1(p, dst, src[0], src[1]);
2013          break;
2014       case BRW_OPCODE_BFI2:
2015          assert(devinfo->gen >= 7);
2016          if (devinfo->gen < 10)
2017             brw_set_default_access_mode(p, BRW_ALIGN_16);
2018          brw_BFI2(p, dst, src[0], src[1], src[2]);
2019          break;
2020
2021       case BRW_OPCODE_IF:
2022          if (inst->src[0].file != BAD_FILE) {
2023             /* The instruction has an embedded compare (only allowed on gen6) */
2024             assert(devinfo->gen == 6);
2025             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
2026          } else {
2027             brw_IF(p, brw_get_default_exec_size(p));
2028          }
2029          break;
2030
2031       case BRW_OPCODE_ELSE:
2032          brw_ELSE(p);
2033          break;
2034       case BRW_OPCODE_ENDIF:
2035          brw_ENDIF(p);
2036          break;
2037
2038       case BRW_OPCODE_DO:
2039          brw_DO(p, brw_get_default_exec_size(p));
2040          break;
2041
2042       case BRW_OPCODE_BREAK:
2043          brw_BREAK(p);
2044          break;
2045       case BRW_OPCODE_CONTINUE:
2046          brw_CONT(p);
2047          break;
2048
2049       case BRW_OPCODE_WHILE:
2050          brw_WHILE(p);
2051          loop_count++;
2052          break;
2053
2054       case SHADER_OPCODE_RCP:
2055       case SHADER_OPCODE_RSQ:
2056       case SHADER_OPCODE_SQRT:
2057       case SHADER_OPCODE_EXP2:
2058       case SHADER_OPCODE_LOG2:
2059       case SHADER_OPCODE_SIN:
2060       case SHADER_OPCODE_COS:
2061          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2062          if (devinfo->gen >= 6) {
2063             assert(inst->mlen == 0);
2064             assert(devinfo->gen >= 7 || inst->exec_size == 8);
2065             gen6_math(p, dst, brw_math_function(inst->opcode),
2066                       src[0], brw_null_reg());
2067          } else {
2068             assert(inst->mlen >= 1);
2069             assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
2070             gen4_math(p, dst,
2071                       brw_math_function(inst->opcode),
2072                       inst->base_mrf, src[0],
2073                       BRW_MATH_PRECISION_FULL);
2074             send_count++;
2075          }
2076          break;
2077       case SHADER_OPCODE_INT_QUOTIENT:
2078       case SHADER_OPCODE_INT_REMAINDER:
2079       case SHADER_OPCODE_POW:
2080          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2081          if (devinfo->gen >= 6) {
2082             assert(inst->mlen == 0);
2083             assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
2084                    inst->exec_size == 8);
2085             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
2086          } else {
2087             assert(inst->mlen >= 1);
2088             assert(inst->exec_size == 8);
2089             gen4_math(p, dst, brw_math_function(inst->opcode),
2090                       inst->base_mrf, src[0],
2091                       BRW_MATH_PRECISION_FULL);
2092             send_count++;
2093          }
2094          break;
2095       case FS_OPCODE_LINTERP:
2096          multiple_instructions_emitted = generate_linterp(inst, dst, src);
2097          break;
2098       case FS_OPCODE_PIXEL_X:
2099          assert(src[0].type == BRW_REGISTER_TYPE_UW);
2100          src[0].subnr = 0 * type_sz(src[0].type);
2101          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2102          break;
2103       case FS_OPCODE_PIXEL_Y:
2104          assert(src[0].type == BRW_REGISTER_TYPE_UW);
2105          src[0].subnr = 4 * type_sz(src[0].type);
2106          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2107          break;
2108
2109       case SHADER_OPCODE_SEND:
2110          generate_send(inst, dst, src[0], src[1], src[2],
2111                        inst->ex_mlen > 0 ? src[3] : brw_null_reg());
2112          if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
2113              (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) {
2114             if (inst->size_written)
2115                fill_count++;
2116             else
2117                spill_count++;
2118          } else {
2119             send_count++;
2120          }
2121          break;
2122
2123       case SHADER_OPCODE_GET_BUFFER_SIZE:
2124          generate_get_buffer_size(inst, dst, src[0], src[1]);
2125          send_count++;
2126          break;
2127       case SHADER_OPCODE_TEX:
2128       case FS_OPCODE_TXB:
2129       case SHADER_OPCODE_TXD:
2130       case SHADER_OPCODE_TXF:
2131       case SHADER_OPCODE_TXF_CMS:
2132       case SHADER_OPCODE_TXL:
2133       case SHADER_OPCODE_TXS:
2134       case SHADER_OPCODE_LOD:
2135       case SHADER_OPCODE_TG4:
2136       case SHADER_OPCODE_SAMPLEINFO:
2137          assert(inst->src[0].file == BAD_FILE);
2138          generate_tex(inst, dst, src[1], src[2]);
2139          send_count++;
2140          break;
2141
2142       case FS_OPCODE_DDX_COARSE:
2143       case FS_OPCODE_DDX_FINE:
2144          generate_ddx(inst, dst, src[0]);
2145          break;
2146       case FS_OPCODE_DDY_COARSE:
2147       case FS_OPCODE_DDY_FINE:
2148          generate_ddy(inst, dst, src[0]);
2149          break;
2150
2151       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
2152          generate_scratch_write(inst, src[0]);
2153          spill_count++;
2154          break;
2155
2156       case SHADER_OPCODE_GEN4_SCRATCH_READ:
2157          generate_scratch_read(inst, dst);
2158          fill_count++;
2159          break;
2160
2161       case SHADER_OPCODE_GEN7_SCRATCH_READ:
2162          generate_scratch_read_gen7(inst, dst);
2163          fill_count++;
2164          break;
2165
2166       case SHADER_OPCODE_MOV_INDIRECT:
2167          generate_mov_indirect(inst, dst, src[0], src[1]);
2168          break;
2169
2170       case SHADER_OPCODE_URB_READ_SIMD8:
2171       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
2172          generate_urb_read(inst, dst, src[0]);
2173          send_count++;
2174          break;
2175
2176       case SHADER_OPCODE_URB_WRITE_SIMD8:
2177       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
2178       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
2179       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
2180          generate_urb_write(inst, src[0]);
2181          send_count++;
2182          break;
2183
2184       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2185          assert(inst->force_writemask_all);
2186          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
2187          send_count++;
2188          break;
2189
2190       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2191          assert(inst->force_writemask_all);
2192          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
2193          send_count++;
2194          break;
2195
2196       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
2197          generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
2198          send_count++;
2199          break;
2200
2201       case FS_OPCODE_REP_FB_WRITE:
2202       case FS_OPCODE_FB_WRITE:
2203          generate_fb_write(inst, src[0]);
2204          send_count++;
2205          break;
2206
2207       case FS_OPCODE_FB_READ:
2208          generate_fb_read(inst, dst, src[0]);
2209          send_count++;
2210          break;
2211
2212       case FS_OPCODE_DISCARD_JUMP:
2213          generate_discard_jump(inst);
2214          break;
2215
2216       case SHADER_OPCODE_SHADER_TIME_ADD:
2217          generate_shader_time_add(inst, src[0], src[1], src[2]);
2218          break;
2219
2220       case SHADER_OPCODE_MEMORY_FENCE: {
2221          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2222          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2223          const unsigned sends =
2224             brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
2225                              src[2].ud);
2226          send_count += sends;
2227          break;
2228       }
2229
2230       case FS_OPCODE_SCHEDULING_FENCE:
2231          if (unlikely(debug_flag))
2232             disasm_info->use_tail = true;
2233          break;
2234
2235       case SHADER_OPCODE_INTERLOCK:
2236          assert(devinfo->gen >= 9);
2237          /* The interlock is basically a memory fence issued via sendc */
2238          brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
2239          break;
2240
2241       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
2242          const struct brw_reg mask =
2243             brw_stage_has_packed_dispatch(devinfo, stage,
2244                                           prog_data) ? brw_imm_ud(~0u) :
2245             stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
2246             brw_dmask_reg();
2247          brw_find_live_channel(p, dst, mask);
2248          break;
2249       }
2250       case FS_OPCODE_LOAD_LIVE_CHANNELS: {
2251          assert(devinfo->gen >= 8);
2252          assert(inst->force_writemask_all && inst->group == 0);
2253          assert(inst->dst.file == BAD_FILE);
2254          brw_set_default_exec_size(p, BRW_EXECUTE_1);
2255          brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
2256                            BRW_REGISTER_TYPE_UD),
2257                  retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
2258          break;
2259       }
2260       case SHADER_OPCODE_BROADCAST:
2261          assert(inst->force_writemask_all);
2262          brw_broadcast(p, dst, src[0], src[1]);
2263          break;
2264
2265       case SHADER_OPCODE_SHUFFLE:
2266          generate_shuffle(inst, dst, src[0], src[1]);
2267          break;
2268
2269       case SHADER_OPCODE_SEL_EXEC:
2270          assert(inst->force_writemask_all);
2271          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2272          brw_MOV(p, dst, src[1]);
2273          brw_set_default_mask_control(p, BRW_MASK_ENABLE);
2274          brw_set_default_swsb(p, tgl_swsb_null());
2275          brw_MOV(p, dst, src[0]);
2276          break;
2277
2278       case SHADER_OPCODE_QUAD_SWIZZLE:
2279          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2280          assert(src[1].type == BRW_REGISTER_TYPE_UD);
2281          generate_quad_swizzle(inst, dst, src[0], src[1].ud);
2282          break;
2283
2284       case SHADER_OPCODE_CLUSTER_BROADCAST: {
2285          assert(!src[0].negate && !src[0].abs);
2286          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2287          assert(src[1].type == BRW_REGISTER_TYPE_UD);
2288          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2289          assert(src[2].type == BRW_REGISTER_TYPE_UD);
2290          const unsigned component = src[1].ud;
2291          const unsigned cluster_size = src[2].ud;
2292          unsigned vstride = cluster_size;
2293          unsigned width = cluster_size;
2294
2295          /* The maximum exec_size is 32, but the maximum width is only 16. */
2296          if (inst->exec_size == width) {
2297             vstride = 0;
2298             width = 1;
2299          }
2300
2301          struct brw_reg strided = stride(suboffset(src[0], component),
2302                                          vstride, width, 0);
2303          if (type_sz(src[0].type) > 4 &&
2304              (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
2305             /* IVB has an issue (which we found empirically) where it reads
2306              * two address register components per channel for indirectly
2307              * addressed 64-bit sources.
2308              *
2309              * From the Cherryview PRM Vol 7. "Register Region Restrictions":
2310              *
2311              *    "When source or destination datatype is 64b or operation is
2312              *    integer DWord multiply, indirect addressing must not be
2313              *    used."
2314              *
2315              * To work around both of these, we do two integer MOVs instead of
2316              * one 64-bit MOV.  Because no double value should ever cross a
2317              * register boundary, it's safe to use the immediate offset in the
2318              * indirect here to handle adding 4 bytes to the offset and avoid
2319              * the extra ADD to the register file.
2320              */
2321             assert(src[0].type == dst.type);
2322             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
2323                        subscript(strided, BRW_REGISTER_TYPE_D, 0));
2324             brw_set_default_swsb(p, tgl_swsb_null());
2325             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
2326                        subscript(strided, BRW_REGISTER_TYPE_D, 1));
2327          } else {
2328             brw_MOV(p, dst, strided);
2329          }
2330          break;
2331       }
2332
2333       case FS_OPCODE_SET_SAMPLE_ID:
2334          generate_set_sample_id(inst, dst, src[0], src[1]);
2335          break;
2336
2337       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2338           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2339           break;
2340
2341       case FS_OPCODE_PLACEHOLDER_HALT:
2342          /* This is the place where the final HALT needs to be inserted if
2343           * we've emitted any discards.  If not, this will emit no code.
2344           */
2345          if (!patch_discard_jumps_to_fb_writes()) {
2346             if (unlikely(debug_flag)) {
2347                disasm_info->use_tail = true;
2348             }
2349          }
2350          break;
2351
2352       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2353          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2354                                            GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2355          send_count++;
2356          break;
2357
2358       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2359          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2360                                            GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2361          send_count++;
2362          break;
2363
2364       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2365          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2366                                            GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2367          send_count++;
2368          break;
2369
2370       case CS_OPCODE_CS_TERMINATE:
2371          generate_cs_terminate(inst, src[0]);
2372          send_count++;
2373          break;
2374
2375       case SHADER_OPCODE_BARRIER:
2376          generate_barrier(inst, src[0]);
2377          send_count++;
2378          break;
2379
2380       case BRW_OPCODE_DIM:
2381          assert(devinfo->is_haswell);
2382          assert(src[0].type == BRW_REGISTER_TYPE_DF);
2383          assert(dst.type == BRW_REGISTER_TYPE_DF);
2384          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2385          break;
2386
2387       case SHADER_OPCODE_RND_MODE: {
2388          assert(src[0].file == BRW_IMMEDIATE_VALUE);
2389          /*
2390           * Changes the floating point rounding mode updating the control
2391           * register field defined at cr0.0[5-6] bits.
2392           */
2393          enum brw_rnd_mode mode =
2394             (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
2395          brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
2396       }
2397          break;
2398
2399       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
2400          assert(src[0].file == BRW_IMMEDIATE_VALUE);
2401          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2402          brw_float_controls_mode(p, src[0].d, src[1].d);
2403          break;
2404
2405       default:
2406          unreachable("Unsupported opcode");
2407
2408       case SHADER_OPCODE_LOAD_PAYLOAD:
2409          unreachable("Should be lowered by lower_load_payload()");
2410       }
2411
2412       if (multiple_instructions_emitted)
2413          continue;
2414
2415       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2416          assert(p->next_insn_offset == last_insn_offset + 16 ||
2417                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2418                  "emitting more than 1 instruction");
2419
2420          brw_inst *last = &p->store[last_insn_offset / 16];
2421
2422          if (inst->conditional_mod)
2423             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2424          if (devinfo->gen < 12) {
2425             brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2426             brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2427          }
2428       }
2429    }
2430
2431    brw_set_uip_jip(p, start_offset);
2432
2433    /* end of program sentinel */
2434    disasm_new_inst_group(disasm_info, p->next_insn_offset);
2435
2436 #ifndef NDEBUG
2437    bool validated =
2438 #else
2439    if (unlikely(debug_flag))
2440 #endif
2441       brw_validate_instructions(devinfo, p->store,
2442                                 start_offset,
2443                                 p->next_insn_offset,
2444                                 disasm_info);
2445
2446    int before_size = p->next_insn_offset - start_offset;
2447    brw_compact_instructions(p, start_offset, disasm_info);
2448    int after_size = p->next_insn_offset - start_offset;
2449
2450    if (unlikely(debug_flag)) {
2451       unsigned char sha1[21];
2452       char sha1buf[41];
2453
2454       _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
2455                          after_size, sha1);
2456       _mesa_sha1_format(sha1buf, sha1);
2457
2458       fprintf(stderr, "Native code for %s (sha1 %s)\n"
2459               "SIMD%d shader: %d instructions. %d loops. %u cycles. "
2460               "%d:%d spills:fills, %u sends, "
2461               "scheduled with mode %s. "
2462               "Promoted %u constants. "
2463               "Compacted %d to %d bytes (%.0f%%)\n",
2464               shader_name, sha1buf,
2465               dispatch_width, before_size / 16,
2466               loop_count, perf.latency,
2467               spill_count, fill_count, send_count,
2468               shader_stats.scheduler_mode,
2469               shader_stats.promoted_constants,
2470               before_size, after_size,
2471               100.0f * (before_size - after_size) / before_size);
2472
2473       /* overriding the shader makes disasm_info invalid */
2474       if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
2475          dump_assembly(p->store, disasm_info, perf.block_latency);
2476       } else {
2477          fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
2478       }
2479    }
2480    ralloc_free(disasm_info);
2481    assert(validated);
2482
2483    compiler->shader_debug_log(log_data,
2484                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2485                               "%d:%d spills:fills, %u sends, "
2486                               "scheduled with mode %s, "
2487                               "Promoted %u constants, "
2488                               "compacted %d to %d bytes.",
2489                               _mesa_shader_stage_to_abbrev(stage),
2490                               dispatch_width, before_size / 16 - nop_count,
2491                               loop_count, perf.latency,
2492                               spill_count, fill_count, send_count,
2493                               shader_stats.scheduler_mode,
2494                               shader_stats.promoted_constants,
2495                               before_size, after_size);
2496    if (stats) {
2497       stats->dispatch_width = dispatch_width;
2498       stats->instructions = before_size / 16 - nop_count;
2499       stats->sends = send_count;
2500       stats->loops = loop_count;
2501       stats->cycles = perf.latency;
2502       stats->spills = spill_count;
2503       stats->fills = fill_count;
2504    }
2505
2506    return start_offset;
2507 }
2508
2509 const unsigned *
2510 fs_generator::get_assembly()
2511 {
2512    return brw_get_program(p, &prog_data->program_size);
2513 }