src/intel/compiler/brw_fs_generator.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_generator.cpp
  25  *
  26  * This file supports generating code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 #include "brw_eu.h"
  31 #include "brw_fs.h"
  32 #include "brw_cfg.h"
  33
  34 static enum brw_reg_file
  35 brw_file_from_reg(fs_reg *reg)
  36 {
  37    switch (reg->file) {
  38    case ARF:
  39       return BRW_ARCHITECTURE_REGISTER_FILE;
  40    case FIXED_GRF:
  41    case VGRF:
  42       return BRW_GENERAL_REGISTER_FILE;
  43    case MRF:
  44       return BRW_MESSAGE_REGISTER_FILE;
  45    case IMM:
  46       return BRW_IMMEDIATE_VALUE;
  47    case BAD_FILE:
  48    case ATTR:
  49    case UNIFORM:
  50       unreachable("not reached");
  51    }
  52    return BRW_ARCHITECTURE_REGISTER_FILE;
  53 }
  54
  55 static struct brw_reg
  56 brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
  57                     fs_reg *reg, bool compressed)
  58 {
  59    struct brw_reg brw_reg;
  60
  61    switch (reg->file) {
  62    case MRF:
  63       assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
  64       /* Fallthrough */
  65    case VGRF:
  66       if (reg->stride == 0) {
  67          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
  68       } else {
  69          /* From the Haswell PRM:
  70           *
  71           *  "VertStride must be used to cross GRF register boundaries. This
  72           *   rule implies that elements within a 'Width' cannot cross GRF
  73           *   boundaries."
  74           *
  75           * The maximum width value that could satisfy this restriction is:
  76           */
  77          const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
  78
  79          /* Because the hardware can only split source regions at a whole
  80           * multiple of width during decompression (i.e. vertically), clamp
  81           * the value obtained above to the physical execution size of a
  82           * single decompressed chunk of the instruction:
  83           */
  84          const unsigned phys_width = compressed ? inst->exec_size / 2 :
  85                                      inst->exec_size;
  86
  87          /* XXX - The equation above is strictly speaking not correct on
  88           *       hardware that supports unbalanced GRF writes -- On Gen9+
  89           *       each decompressed chunk of the instruction may have a
  90           *       different execution size when the number of components
  91           *       written to each destination GRF is not the same.
  92           */
  93          const unsigned width = MIN2(reg_width, phys_width);
  94          brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
  95          brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
  96
  97          if (devinfo->gen == 7 && !devinfo->is_haswell) {
  98             /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
  99              *  "Each DF (Double Float) operand uses an element size of 4 rather
 100              *   than 8 and all regioning parameters are twice what the values
 101              *   would be based on the true element size: ExecSize, Width,
 102              *   HorzStride, and VertStride. Each DF operand uses a pair of
 103              *   channels and all masking and swizzing should be adjusted
 104              *   appropriately."
 105              *
 106              * From the IvyBridge PRM (Special Requirements for Handling Double
 107              * Precision Data Types, page 71):
 108              *  "In Align1 mode, all regioning parameters like stride, execution
 109              *   size, and width must use the syntax of a pair of packed
 110              *   floats. The offsets for these data types must be 64-bit
 111              *   aligned. The execution size and regioning parameters are in terms
 112              *   of floats."
 113              *
 114              * Summarized: when handling DF-typed arguments, ExecSize,
 115              * VertStride, and Width must be doubled.
 116              *
 117              * It applies to BayTrail too.
 118              */
 119             if (type_sz(reg->type) == 8) {
 120                brw_reg.width++;
 121                if (brw_reg.vstride > 0)
 122                   brw_reg.vstride++;
 123                assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
 124             }
 125
 126             /* When converting from DF->F, we set the destination stride to 2
 127              * because each d2f conversion implicitly writes 2 floats, being
 128              * the first one the converted value. IVB/BYT actually writes two
 129              * F components per SIMD channel, and every other component is
 130              * filled with garbage.
 131              */
 132             if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
 133                 type_sz(inst->dst.type) < 8) {
 134                assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
 135                brw_reg.hstride--;
 136             }
 137          }
 138       }
 139
 140       brw_reg = retype(brw_reg, reg->type);
 141       brw_reg = byte_offset(brw_reg, reg->offset);
 142       brw_reg.abs = reg->abs;
 143       brw_reg.negate = reg->negate;
 144       break;
 145    case ARF:
 146    case FIXED_GRF:
 147    case IMM:
 148       assert(reg->offset == 0);
 149       brw_reg = reg->as_brw_reg();
 150       break;
 151    case BAD_FILE:
 152       /* Probably unused. */
 153       brw_reg = brw_null_reg();
 154       break;
 155    case ATTR:
 156    case UNIFORM:
 157       unreachable("not reached");
 158    }
 159
 160    /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
 161     * region, but on IVB and BYT DF regions must be programmed in terms of
 162     * floats. A <0,2,1> region accomplishes this.
 163     */
 164    if (devinfo->gen == 7 && !devinfo->is_haswell &&
 165        type_sz(reg->type) == 8 &&
 166        brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
 167        brw_reg.width == BRW_WIDTH_1 &&
 168        brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
 169       brw_reg.width = BRW_WIDTH_2;
 170       brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
 171    }
 172
 173    return brw_reg;
 174 }
 175
 176 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
 177                            void *mem_ctx,
 178                            const void *key,
 179                            struct brw_stage_prog_data *prog_data,
 180                            unsigned promoted_constants,
 181                            bool runtime_check_aads_emit,
 182                            gl_shader_stage stage)
 183
 184    : compiler(compiler), log_data(log_data),
 185      devinfo(compiler->devinfo), key(key),
 186      prog_data(prog_data),
 187      promoted_constants(promoted_constants),
 188      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
 189      stage(stage), mem_ctx(mem_ctx)
 190 {
 191    p = rzalloc(mem_ctx, struct brw_codegen);
 192    brw_init_codegen(devinfo, p, mem_ctx);
 193 }
 194
 195 fs_generator::~fs_generator()
 196 {
 197 }
 198
 199 class ip_record : public exec_node {
 200 public:
 201    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
 202
 203    ip_record(int ip)
 204    {
 205       this->ip = ip;
 206    }
 207
 208    int ip;
 209 };
 210
 211 bool
 212 fs_generator::patch_discard_jumps_to_fb_writes()
 213 {
 214    if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
 215       return false;
 216
 217    int scale = brw_jump_scale(p->devinfo);
 218
 219    /* There is a somewhat strange undocumented requirement of using
 220     * HALT, according to the simulator.  If some channel has HALTed to
 221     * a particular UIP, then by the end of the program, every channel
 222     * must have HALTed to that UIP.  Furthermore, the tracking is a
 223     * stack, so you can't do the final halt of a UIP after starting
 224     * halting to a new UIP.
 225     *
 226     * Symptoms of not emitting this instruction on actual hardware
 227     * included GPU hangs and sparkly rendering on the piglit discard
 228     * tests.
 229     */
 230    brw_inst *last_halt = gen6_HALT(p);
 231    brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
 232    brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
 233
 234    int ip = p->nr_insn;
 235
 236    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
 237       brw_inst *patch = &p->store[patch_ip->ip];
 238
 239       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
 240       /* HALT takes a half-instruction distance from the pre-incremented IP. */
 241       brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
 242    }
 243
 244    this->discard_halt_patches.make_empty();
 245    return true;
 246 }
 247
 248 void
 249 fs_generator::fire_fb_write(fs_inst *inst,
 250                             struct brw_reg payload,
 251                             struct brw_reg implied_header,
 252                             GLuint nr)
 253 {
 254    uint32_t msg_control;
 255
 256    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 257
 258    if (devinfo->gen < 6) {
 259       brw_push_insn_state(p);
 260       brw_set_default_exec_size(p, BRW_EXECUTE_8);
 261       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 262       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 263       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 264       brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
 265       brw_pop_insn_state(p);
 266    }
 267
 268    if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
 269       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
 270    else if (prog_data->dual_src_blend) {
 271       if (!inst->group)
 272          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 273       else
 274          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
 275    } else if (inst->exec_size == 16)
 276       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 277    else
 278       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 279
 280    uint32_t surf_index =
 281       prog_data->binding_table.render_target_start + inst->target;
 282
 283    bool last_render_target = inst->eot ||
 284                              (prog_data->dual_src_blend && dispatch_width == 16);
 285
 286
 287    brw_fb_WRITE(p,
 288                 payload,
 289                 implied_header,
 290                 msg_control,
 291                 surf_index,
 292                 nr,
 293                 0,
 294                 inst->eot,
 295                 last_render_target,
 296                 inst->header_size != 0);
 297
 298    brw_mark_surface_used(&prog_data->base, surf_index);
 299 }
 300
 301 void
 302 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
 303 {
 304    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 305    const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
 306    struct brw_reg implied_header;
 307
 308    if (devinfo->gen < 8 && !devinfo->is_haswell) {
 309       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 310    }
 311
 312    if (inst->base_mrf >= 0)
 313       payload = brw_message_reg(inst->base_mrf);
 314
 315    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
 316     * move, here's g1.
 317     */
 318    if (inst->header_size != 0) {
 319       brw_push_insn_state(p);
 320       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 321       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 322       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 323       brw_set_default_flag_reg(p, 0, 0);
 324
 325       /* On HSW, the GPU will use the predicate on SENDC, unless the header is
 326        * present.
 327        */
 328       if (prog_data->uses_kill) {
 329          struct brw_reg pixel_mask;
 330
 331          if (devinfo->gen >= 6)
 332             pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 333          else
 334             pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 335
 336          brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
 337       }
 338
 339       if (devinfo->gen >= 6) {
 340          brw_push_insn_state(p);
 341          brw_set_default_exec_size(p, BRW_EXECUTE_16);
 342          brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 343          brw_MOV(p,
 344                  retype(payload, BRW_REGISTER_TYPE_UD),
 345                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 346          brw_pop_insn_state(p);
 347
 348          if (inst->target > 0 && key->replicate_alpha) {
 349             /* Set "Source0 Alpha Present to RenderTarget" bit in message
 350              * header.
 351              */
 352             brw_OR(p,
 353                    vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
 354                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
 355                    brw_imm_ud(0x1 << 11));
 356          }
 357
 358          if (inst->target > 0) {
 359             /* Set the render target index for choosing BLEND_STATE. */
 360             brw_MOV(p, retype(vec1(suboffset(payload, 2)),
 361                               BRW_REGISTER_TYPE_UD),
 362                     brw_imm_ud(inst->target));
 363          }
 364
 365          /* Set computes stencil to render target */
 366          if (prog_data->computed_stencil) {
 367             brw_OR(p,
 368                    vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
 369                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
 370                    brw_imm_ud(0x1 << 14));
 371          }
 372
 373          implied_header = brw_null_reg();
 374       } else {
 375          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 376       }
 377
 378       brw_pop_insn_state(p);
 379    } else {
 380       implied_header = brw_null_reg();
 381    }
 382
 383    if (!runtime_check_aads_emit) {
 384       fire_fb_write(inst, payload, implied_header, inst->mlen);
 385    } else {
 386       /* This can only happen in gen < 6 */
 387       assert(devinfo->gen < 6);
 388
 389       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
 390
 391       /* Check runtime bit to detect if we have to send AA data or not */
 392       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 393       brw_AND(p,
 394               v1_null_ud,
 395               retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
 396               brw_imm_ud(1<<26));
 397       brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
 398
 399       int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
 400       brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
 401       {
 402          /* Don't send AA data */
 403          fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
 404       }
 405       brw_land_fwd_jump(p, jmp);
 406       fire_fb_write(inst, payload, implied_header, inst->mlen);
 407    }
 408 }
 409
 410 void
 411 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
 412                                struct brw_reg payload)
 413 {
 414    assert(inst->size_written % REG_SIZE == 0);
 415    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 416    const unsigned surf_index =
 417       prog_data->binding_table.render_target_start + inst->target;
 418
 419    gen9_fb_READ(p, dst, payload, surf_index,
 420                 inst->header_size, inst->size_written / REG_SIZE,
 421                 prog_data->persample_dispatch);
 422
 423    brw_mark_surface_used(&prog_data->base, surf_index);
 424 }
 425
 426 void
 427 fs_generator::generate_mov_indirect(fs_inst *inst,
 428                                     struct brw_reg dst,
 429                                     struct brw_reg reg,
 430                                     struct brw_reg indirect_byte_offset)
 431 {
 432    assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
 433    assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
 434
 435    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
 436
 437    if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
 438       imm_byte_offset += indirect_byte_offset.ud;
 439
 440       reg.nr = imm_byte_offset / REG_SIZE;
 441       reg.subnr = imm_byte_offset % REG_SIZE;
 442       brw_MOV(p, dst, reg);
 443    } else {
 444       /* Prior to Broadwell, there are only 8 address registers. */
 445       assert(inst->exec_size == 8 || devinfo->gen >= 8);
 446
 447       /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
 448       struct brw_reg addr = vec8(brw_address_reg(0));
 449
 450       /* The destination stride of an instruction (in bytes) must be greater
 451        * than or equal to the size of the rest of the instruction.  Since the
 452        * address register is of type UW, we can't use a D-type instruction.
 453        * In order to get around this, re retype to UW and use a stride.
 454        */
 455       indirect_byte_offset =
 456          retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
 457
 458       /* There are a number of reasons why we don't use the base offset here.
 459        * One reason is that the field is only 9 bits which means we can only
 460        * use it to access the first 16 GRFs.  Also, from the Haswell PRM
 461        * section "Register Region Restrictions":
 462        *
 463        *    "The lower bits of the AddressImmediate must not overflow to
 464        *    change the register address.  The lower 5 bits of Address
 465        *    Immediate when added to lower 5 bits of address register gives
 466        *    the sub-register offset. The upper bits of Address Immediate
 467        *    when added to upper bits of address register gives the register
 468        *    address. Any overflow from sub-register offset is dropped."
 469        *
 470        * Since the indirect may cause us to cross a register boundary, this
 471        * makes the base offset almost useless.  We could try and do something
 472        * clever where we use a actual base offset if base_offset % 32 == 0 but
 473        * that would mean we were generating different code depending on the
 474        * base offset.  Instead, for the sake of consistency, we'll just do the
 475        * add ourselves.  This restriction is only listed in the Haswell PRM
 476        * but empirical testing indicates that it applies on all older
 477        * generations and is lifted on Broadwell.
 478        *
 479        * In the end, while base_offset is nice to look at in the generated
 480        * code, using it saves us 0 instructions and would require quite a bit
 481        * of case-by-case work.  It's just not worth it.
 482        */
 483       brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
 484       struct brw_reg ind_src = brw_VxH_indirect(0, 0);
 485
 486       brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
 487
 488       if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
 489           !inst->get_next()->is_tail_sentinel() &&
 490           ((fs_inst *)inst->get_next())->mlen > 0) {
 491          /* From the Sandybridge PRM:
 492           *
 493           *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
 494           *    instruction that “indexed/indirect” source AND is followed by a
 495           *    send, the instruction requires a “Switch”. This is to avoid
 496           *    race condition where send may dispatch before MRF is updated."
 497           */
 498          brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
 499       }
 500    }
 501 }
 502
 503 void
 504 fs_generator::generate_urb_read(fs_inst *inst,
 505                                 struct brw_reg dst,
 506                                 struct brw_reg header)
 507 {
 508    assert(inst->size_written % REG_SIZE == 0);
 509    assert(header.file == BRW_GENERAL_REGISTER_FILE);
 510    assert(header.type == BRW_REGISTER_TYPE_UD);
 511
 512    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 513    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
 514    brw_set_src0(p, send, header);
 515    brw_set_src1(p, send, brw_imm_ud(0u));
 516
 517    brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
 518    brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
 519
 520    if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
 521       brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
 522
 523    brw_inst_set_mlen(p->devinfo, send, inst->mlen);
 524    brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
 525    brw_inst_set_header_present(p->devinfo, send, true);
 526    brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
 527 }
 528
 529 void
 530 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
 531 {
 532    brw_inst *insn;
 533
 534    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 535
 536    brw_set_dest(p, insn, brw_null_reg());
 537    brw_set_src0(p, insn, payload);
 538    brw_set_src1(p, insn, brw_imm_d(0));
 539
 540    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
 541    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
 542
 543    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
 544        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
 545       brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
 546
 547    if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
 548        inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
 549       brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
 550
 551    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
 552    brw_inst_set_rlen(p->devinfo, insn, 0);
 553    brw_inst_set_eot(p->devinfo, insn, inst->eot);
 554    brw_inst_set_header_present(p->devinfo, insn, true);
 555    brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
 556 }
 557
 558 void
 559 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
 560 {
 561    struct brw_inst *insn;
 562
 563    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 564
 565    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
 566    brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
 567    brw_set_src1(p, insn, brw_imm_d(0));
 568
 569    /* Terminate a compute shader by sending a message to the thread spawner.
 570     */
 571    brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
 572    brw_inst_set_mlen(devinfo, insn, 1);
 573    brw_inst_set_rlen(devinfo, insn, 0);
 574    brw_inst_set_eot(devinfo, insn, inst->eot);
 575    brw_inst_set_header_present(devinfo, insn, false);
 576
 577    brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
 578    brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
 579
 580    /* Note that even though the thread has a URB resource associated with it,
 581     * we set the "do not dereference URB" bit, because the URB resource is
 582     * managed by the fixed-function unit, so it will free it automatically.
 583     */
 584    brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
 585
 586    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
 587 }
 588
 589 void
 590 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
 591 {
 592    brw_barrier(p, src);
 593    brw_WAIT(p);
 594 }
 595
 596 void
 597 fs_generator::generate_linterp(fs_inst *inst,
 598                              struct brw_reg dst, struct brw_reg *src)
 599 {
 600    /* PLN reads:
 601     *                      /   in SIMD16   \
 602     *    -----------------------------------
 603     *   | src1+0 | src1+1 | src1+2 | src1+3 |
 604     *   |-----------------------------------|
 605     *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
 606     *    -----------------------------------
 607     *
 608     * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
 609     *
 610     *    -----------------------------------
 611     *   | src1+0 | src1+1 | src1+2 | src1+3 |
 612     *   |-----------------------------------|
 613     *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
 614     *   |-----------------------------------|
 615     *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
 616     *    -----------------------------------
 617     *
 618     * See also: emit_interpolation_setup_gen4().
 619     */
 620    struct brw_reg delta_x = src[0];
 621    struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
 622    struct brw_reg interp = src[1];
 623
 624    if (devinfo->has_pln &&
 625        (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
 626       brw_PLN(p, dst, interp, delta_x);
 627    } else {
 628       brw_LINE(p, brw_null_reg(), interp, delta_x);
 629       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 630    }
 631 }
 632
 633 void
 634 fs_generator::generate_get_buffer_size(fs_inst *inst,
 635                                        struct brw_reg dst,
 636                                        struct brw_reg src,
 637                                        struct brw_reg surf_index)
 638 {
 639    assert(devinfo->gen >= 7);
 640    assert(surf_index.file == BRW_IMMEDIATE_VALUE);
 641
 642    uint32_t simd_mode;
 643    int rlen = 4;
 644
 645    switch (inst->exec_size) {
 646    case 8:
 647       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 648       break;
 649    case 16:
 650       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 651       break;
 652    default:
 653       unreachable("Invalid width for texture instruction");
 654    }
 655
 656    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 657       rlen = 8;
 658       dst = vec16(dst);
 659    }
 660
 661    brw_SAMPLE(p,
 662               retype(dst, BRW_REGISTER_TYPE_UW),
 663               inst->base_mrf,
 664               src,
 665               surf_index.ud,
 666               0,
 667               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
 668               rlen, /* response length */
 669               inst->mlen,
 670               inst->header_size > 0,
 671               simd_mode,
 672               BRW_SAMPLER_RETURN_FORMAT_SINT32);
 673
 674    brw_mark_surface_used(prog_data, surf_index.ud);
 675 }
 676
 677 void
 678 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
 679                            struct brw_reg surface_index,
 680                            struct brw_reg sampler_index)
 681 {
 682    assert(inst->size_written % REG_SIZE == 0);
 683    int msg_type = -1;
 684    uint32_t simd_mode;
 685    uint32_t return_format;
 686    bool is_combined_send = inst->eot;
 687
 688    switch (dst.type) {
 689    case BRW_REGISTER_TYPE_D:
 690       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
 691       break;
 692    case BRW_REGISTER_TYPE_UD:
 693       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 694       break;
 695    default:
 696       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 697       break;
 698    }
 699
 700    /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
 701     * is set as part of the message descriptor.  On gen4, the PRM seems to
 702     * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
 703     * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
 704     * gone from the message descriptor entirely and you just get UINT32 all
  705     * the time regardless.  Since we can really only do non-UINT32 on gen4,
 706     * just stomp it to UINT32 all the time.
 707     */
 708    if (inst->opcode == SHADER_OPCODE_TXS)
 709       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 710
 711    switch (inst->exec_size) {
 712    case 8:
 713       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 714       break;
 715    case 16:
 716       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 717       break;
 718    default:
 719       unreachable("Invalid width for texture instruction");
 720    }
 721
 722    if (devinfo->gen >= 5) {
 723       switch (inst->opcode) {
 724       case SHADER_OPCODE_TEX:
 725          if (inst->shadow_compare) {
 726             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 727          } else {
 728             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 729          }
 730          break;
 731       case FS_OPCODE_TXB:
 732          if (inst->shadow_compare) {
 733             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 734          } else {
 735             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 736          }
 737          break;
 738       case SHADER_OPCODE_TXL:
 739          if (inst->shadow_compare) {
 740             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 741          } else {
 742             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 743          }
 744          break;
 745       case SHADER_OPCODE_TXL_LZ:
 746          assert(devinfo->gen >= 9);
 747          if (inst->shadow_compare) {
 748             msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
 749          } else {
 750             msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
 751          }
 752          break;
 753       case SHADER_OPCODE_TXS:
 754          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 755          break;
 756       case SHADER_OPCODE_TXD:
 757          if (inst->shadow_compare) {
 758             /* Gen7.5+.  Otherwise, lowered in NIR */
 759             assert(devinfo->gen >= 8 || devinfo->is_haswell);
 760             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
 761          } else {
 762             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 763          }
 764          break;
 765       case SHADER_OPCODE_TXF:
 766          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 767          break;
 768       case SHADER_OPCODE_TXF_LZ:
 769          assert(devinfo->gen >= 9);
 770          msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
 771          break;
 772       case SHADER_OPCODE_TXF_CMS_W:
 773          assert(devinfo->gen >= 9);
 774          msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
 775          break;
 776       case SHADER_OPCODE_TXF_CMS:
 777          if (devinfo->gen >= 7)
 778             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
 779          else
 780             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 781          break;
 782       case SHADER_OPCODE_TXF_UMS:
 783          assert(devinfo->gen >= 7);
 784          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
 785          break;
 786       case SHADER_OPCODE_TXF_MCS:
 787          assert(devinfo->gen >= 7);
 788          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
 789          break;
 790       case SHADER_OPCODE_LOD:
 791          msg_type = GEN5_SAMPLER_MESSAGE_LOD;
 792          break;
 793       case SHADER_OPCODE_TG4:
 794          if (inst->shadow_compare) {
 795             assert(devinfo->gen >= 7);
 796             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
 797          } else {
 798             assert(devinfo->gen >= 6);
 799             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
 800          }
 801          break;
 802       case SHADER_OPCODE_TG4_OFFSET:
 803          assert(devinfo->gen >= 7);
 804          if (inst->shadow_compare) {
 805             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
 806          } else {
 807             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
 808          }
 809          break;
 810       case SHADER_OPCODE_SAMPLEINFO:
 811          msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
 812          break;
 813       default:
 814          unreachable("not reached");
 815       }
 816    } else {
 817       switch (inst->opcode) {
 818       case SHADER_OPCODE_TEX:
 819          /* Note that G45 and older determines shadow compare and dispatch width
 820           * from message length for most messages.
 821           */
 822          if (inst->exec_size == 8) {
 823             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
 824             if (inst->shadow_compare) {
 825                assert(inst->mlen == 6);
 826             } else {
 827                assert(inst->mlen <= 4);
 828             }
 829          } else {
 830             if (inst->shadow_compare) {
 831                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
 832                assert(inst->mlen == 9);
 833             } else {
 834                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
 835                assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
 836             }
 837          }
 838          break;
 839       case FS_OPCODE_TXB:
 840          if (inst->shadow_compare) {
 841             assert(inst->exec_size == 8);
 842             assert(inst->mlen == 6);
 843             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 844          } else {
 845             assert(inst->mlen == 9);
 846             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 847             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 848          }
 849          break;
 850       case SHADER_OPCODE_TXL:
 851          if (inst->shadow_compare) {
 852             assert(inst->exec_size == 8);
 853             assert(inst->mlen == 6);
 854             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 855          } else {
 856             assert(inst->mlen == 9);
 857             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
 858             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 859          }
 860          break;
 861       case SHADER_OPCODE_TXD:
 862          /* There is no sample_d_c message; comparisons are done manually */
 863          assert(inst->exec_size == 8);
 864          assert(inst->mlen == 7 || inst->mlen == 10);
 865          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 866          break;
 867       case SHADER_OPCODE_TXF:
 868          assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
 869          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 870          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 871          break;
 872       case SHADER_OPCODE_TXS:
 873          assert(inst->mlen == 3);
 874          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
 875          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 876          break;
 877       default:
 878          unreachable("not reached");
 879       }
 880    }
 881    assert(msg_type != -1);
 882
 883    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 884       dst = vec16(dst);
 885    }
 886
 887    assert(devinfo->gen < 7 || inst->header_size == 0 ||
 888           src.file == BRW_GENERAL_REGISTER_FILE);
 889
 890    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
 891
 892    /* Load the message header if present.  If there's a texture offset,
 893     * we need to set it up explicitly and load the offset bitfield.
 894     * Otherwise, we can use an implied move from g0 to the first message reg.
 895     */
 896    if (inst->header_size != 0) {
 897       if (devinfo->gen < 6 && !inst->offset) {
 898          /* Set up an implied move from g0 to the MRF. */
 899          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 900       } else {
 901          struct brw_reg header_reg;
 902
 903          if (devinfo->gen >= 7) {
 904             header_reg = src;
 905          } else {
 906             assert(inst->base_mrf != -1);
 907             header_reg = brw_message_reg(inst->base_mrf);
 908          }
 909
 910          brw_push_insn_state(p);
 911          brw_set_default_exec_size(p, BRW_EXECUTE_8);
 912          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 913          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 914          /* Explicitly set up the message header by copying g0 to the MRF. */
 915          brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
 916
 917          if (inst->offset) {
 918             /* Set the offset bits in DWord 2. */
 919             brw_MOV(p, get_element_ud(header_reg, 2),
 920                        brw_imm_ud(inst->offset));
 921          } else if (stage != MESA_SHADER_VERTEX &&
 922                     stage != MESA_SHADER_FRAGMENT) {
 923             /* The vertex and fragment stages have g0.2 set to 0, so
 924              * header0.2 is 0 when g0 is copied. Other stages may not, so we
 925              * must set it to 0 to avoid setting undesirable bits in the
 926              * message.
 927              */
 928             brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
 929          }
 930
 931          brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
 932          brw_pop_insn_state(p);
 933       }
 934    }
 935
 936    uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
 937          inst->opcode == SHADER_OPCODE_TG4_OFFSET)
 938          ? prog_data->binding_table.gather_texture_start
 939          : prog_data->binding_table.texture_start;
 940
 941    if (surface_index.file == BRW_IMMEDIATE_VALUE &&
 942        sampler_index.file == BRW_IMMEDIATE_VALUE) {
 943       uint32_t surface = surface_index.ud;
 944       uint32_t sampler = sampler_index.ud;
 945
 946       brw_SAMPLE(p,
 947                  retype(dst, BRW_REGISTER_TYPE_UW),
 948                  inst->base_mrf,
 949                  src,
 950                  surface + base_binding_table_index,
 951                  sampler % 16,
 952                  msg_type,
 953                  inst->size_written / REG_SIZE,
 954                  inst->mlen,
 955                  inst->header_size != 0,
 956                  simd_mode,
 957                  return_format);
 958
 959       brw_mark_surface_used(prog_data, surface + base_binding_table_index);
 960    } else {
 961       /* Non-const sampler index */
 962
 963       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
 964       struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
 965       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 966
 967       brw_push_insn_state(p);
 968       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 969       brw_set_default_access_mode(p, BRW_ALIGN_1);
 970
 971       if (brw_regs_equal(&surface_reg, &sampler_reg)) {
 972          brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
 973       } else {
 974          if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
 975             brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
 976          } else {
 977             brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
 978             brw_OR(p, addr, addr, surface_reg);
 979          }
 980       }
 981       if (base_binding_table_index)
 982          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
 983       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
 984
 985       brw_pop_insn_state(p);
 986
 987       /* dst = send(offset, a0.0 | <descriptor>) */
 988       brw_inst *insn = brw_send_indirect_message(
 989          p, BRW_SFID_SAMPLER, dst, src, addr);
 990       brw_set_sampler_message(p, insn,
 991                               0 /* surface */,
 992                               0 /* sampler */,
 993                               msg_type,
 994                               inst->size_written / REG_SIZE,
 995                               inst->mlen /* mlen */,
 996                               inst->header_size != 0 /* header */,
 997                               simd_mode,
 998                               return_format);
 999
1000       /* visitor knows more than we do about the surface limit required,
1001        * so has already done marking.
1002        */
1003    }
1004
1005    if (is_combined_send) {
1006       brw_inst_set_eot(p->devinfo, brw_last_inst, true);
1007       brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
1008    }
1009 }
1010
1011
1012 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1013  * looking like:
1014  *
1015  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1016  *
1017  * Ideally, we want to produce:
1018  *
1019  *           DDX                     DDY
1020  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1021  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1022  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1023  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1024  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1025  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1026  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1027  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1028  *
1029  * and add another set of two more subspans if in 16-pixel dispatch mode.
1030  *
1031  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1032  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1033  * pair.  But the ideal approximation may impose a huge performance cost on
1034  * sample_d.  On at least Haswell, sample_d instruction does some
1035  * optimizations if the same LOD is used for all pixels in the subspan.
1036  *
1037  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1038  * appropriate swizzling.
1039  */
1040 void
1041 fs_generator::generate_ddx(enum opcode opcode,
1042                            struct brw_reg dst, struct brw_reg src)
1043 {
1044    unsigned vstride, width;
1045
1046    if (opcode == FS_OPCODE_DDX_FINE) {
1047       /* produce accurate derivatives */
1048       vstride = BRW_VERTICAL_STRIDE_2;
1049       width = BRW_WIDTH_2;
1050    } else {
1051       /* replicate the derivative at the top-left pixel to other pixels */
1052       vstride = BRW_VERTICAL_STRIDE_4;
1053       width = BRW_WIDTH_4;
1054    }
1055
1056    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1057                                  src.negate, src.abs,
1058                                  BRW_REGISTER_TYPE_F,
1059                                  vstride,
1060                                  width,
1061                                  BRW_HORIZONTAL_STRIDE_0,
1062                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1063    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1064                                  src.negate, src.abs,
1065                                  BRW_REGISTER_TYPE_F,
1066                                  vstride,
1067                                  width,
1068                                  BRW_HORIZONTAL_STRIDE_0,
1069                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1070    brw_ADD(p, dst, src0, negate(src1));
1071 }
1072
1073 /* The negate_value boolean is used to negate the derivative computation for
1074  * FBOs, since they place the origin at the upper left instead of the lower
1075  * left.
1076  */
1077 void
1078 fs_generator::generate_ddy(enum opcode opcode,
1079                            struct brw_reg dst, struct brw_reg src)
1080 {
1081    if (opcode == FS_OPCODE_DDY_FINE) {
1082       /* produce accurate derivatives */
1083       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1084                                     src.negate, src.abs,
1085                                     BRW_REGISTER_TYPE_F,
1086                                     BRW_VERTICAL_STRIDE_4,
1087                                     BRW_WIDTH_4,
1088                                     BRW_HORIZONTAL_STRIDE_1,
1089                                     BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
1090       struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1091                                     src.negate, src.abs,
1092                                     BRW_REGISTER_TYPE_F,
1093                                     BRW_VERTICAL_STRIDE_4,
1094                                     BRW_WIDTH_4,
1095                                     BRW_HORIZONTAL_STRIDE_1,
1096                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
1097       brw_push_insn_state(p);
1098       brw_set_default_access_mode(p, BRW_ALIGN_16);
1099       brw_ADD(p, dst, negate(src0), src1);
1100       brw_pop_insn_state(p);
1101    } else {
1102       /* replicate the derivative at the top-left pixel to other pixels */
1103       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1104                                     src.negate, src.abs,
1105                                     BRW_REGISTER_TYPE_F,
1106                                     BRW_VERTICAL_STRIDE_4,
1107                                     BRW_WIDTH_4,
1108                                     BRW_HORIZONTAL_STRIDE_0,
1109                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1110       struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1111                                     src.negate, src.abs,
1112                                     BRW_REGISTER_TYPE_F,
1113                                     BRW_VERTICAL_STRIDE_4,
1114                                     BRW_WIDTH_4,
1115                                     BRW_HORIZONTAL_STRIDE_0,
1116                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1117       brw_ADD(p, dst, negate(src0), src1);
1118    }
1119 }
1120
1121 void
1122 fs_generator::generate_discard_jump(fs_inst *inst)
1123 {
1124    assert(devinfo->gen >= 6);
1125
1126    /* This HALT will be patched up at FB write time to point UIP at the end of
1127     * the program, and at brw_uip_jip() JIP will be set to the end of the
1128     * current block (or the program).
1129     */
1130    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1131    gen6_HALT(p);
1132 }
1133
1134 void
1135 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1136 {
1137    /* The 32-wide messages only respect the first 16-wide half of the channel
1138     * enable signals which are replicated identically for the second group of
1139     * 16 channels, so we cannot use them unless the write is marked
1140     * force_writemask_all.
1141     */
1142    const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1143                                MIN2(16, inst->exec_size);
1144    const unsigned block_size = 4 * lower_size / REG_SIZE;
1145    assert(inst->mlen != 0);
1146
1147    brw_push_insn_state(p);
1148    brw_set_default_exec_size(p, cvt(lower_size) - 1);
1149    brw_set_default_compression(p, lower_size > 8);
1150
1151    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1152       brw_set_default_group(p, inst->group + lower_size * i);
1153
1154       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1155               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1156
1157       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1158                                     block_size,
1159                                     inst->offset + block_size * REG_SIZE * i);
1160    }
1161
1162    brw_pop_insn_state(p);
1163 }
1164
1165 void
1166 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1167 {
1168    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1169    assert(inst->mlen != 0);
1170
1171    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1172                                 inst->exec_size / 8, inst->offset);
1173 }
1174
1175 void
1176 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1177 {
1178    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1179
1180    gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1181 }
1182
1183 void
1184 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1185                                                   struct brw_reg dst,
1186                                                   struct brw_reg index,
1187                                                   struct brw_reg offset)
1188 {
1189    assert(type_sz(dst.type) == 4);
1190    assert(inst->mlen != 0);
1191
1192    assert(index.file == BRW_IMMEDIATE_VALUE &&
1193           index.type == BRW_REGISTER_TYPE_UD);
1194    uint32_t surf_index = index.ud;
1195
1196    assert(offset.file == BRW_IMMEDIATE_VALUE &&
1197           offset.type == BRW_REGISTER_TYPE_UD);
1198    uint32_t read_offset = offset.ud;
1199
1200    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1201                         read_offset, surf_index);
1202 }
1203
1204 void
1205 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1206                                                        struct brw_reg dst,
1207                                                        struct brw_reg index,
1208                                                        struct brw_reg payload)
1209 {
1210    assert(index.type == BRW_REGISTER_TYPE_UD);
1211    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1212    assert(type_sz(dst.type) == 4);
1213
1214    if (index.file == BRW_IMMEDIATE_VALUE) {
1215       const uint32_t surf_index = index.ud;
1216
1217       brw_push_insn_state(p);
1218       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1219       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1220       brw_pop_insn_state(p);
1221
1222       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1223       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1224       brw_set_dp_read_message(p, send, surf_index,
1225                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1226                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1227                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1228                               1, /* mlen */
1229                               true, /* header */
1230                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
1231
1232    } else {
1233       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1234
1235       brw_push_insn_state(p);
1236       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1237
1238       /* a0.0 = surf_index & 0xff */
1239       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1240       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1241       brw_set_dest(p, insn_and, addr);
1242       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1243       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1244
1245       /* dst = send(payload, a0.0 | <descriptor>) */
1246       brw_inst *insn = brw_send_indirect_message(
1247          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1248          retype(dst, BRW_REGISTER_TYPE_UD),
1249          retype(payload, BRW_REGISTER_TYPE_UD), addr);
1250       brw_set_dp_read_message(p, insn, 0 /* surface */,
1251                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1252                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1253                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1254                               1, /* mlen */
1255                               true, /* header */
1256                               DIV_ROUND_UP(inst->size_written, REG_SIZE));
1257
1258       brw_pop_insn_state(p);
1259    }
1260 }
1261
1262 void
1263 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1264                                                        struct brw_reg dst,
1265                                                        struct brw_reg index)
1266 {
1267    assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1268    assert(inst->header_size != 0);
1269    assert(inst->mlen);
1270
1271    assert(index.file == BRW_IMMEDIATE_VALUE &&
1272           index.type == BRW_REGISTER_TYPE_UD);
1273    uint32_t surf_index = index.ud;
1274
1275    uint32_t simd_mode, rlen, msg_type;
1276    if (inst->exec_size == 16) {
1277       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1278       rlen = 8;
1279    } else {
1280       assert(inst->exec_size == 8);
1281       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1282       rlen = 4;
1283    }
1284
1285    if (devinfo->gen >= 5)
1286       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1287    else {
1288       /* We always use the SIMD16 message so that we only have to load U, and
1289        * not V or R.
1290        */
1291       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1292       assert(inst->mlen == 3);
1293       assert(inst->size_written == 8 * REG_SIZE);
1294       rlen = 8;
1295       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1296    }
1297
1298    struct brw_reg header = brw_vec8_grf(0, 0);
1299    gen6_resolve_implied_move(p, &header, inst->base_mrf);
1300
1301    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1302    brw_inst_set_compression(devinfo, send, false);
1303    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1304    brw_set_src0(p, send, header);
1305    if (devinfo->gen < 6)
1306       brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1307
1308    /* Our surface is set up as floats, regardless of what actual data is
1309     * stored in it.
1310     */
1311    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1312    brw_set_sampler_message(p, send,
1313                            surf_index,
1314                            0, /* sampler (unused) */
1315                            msg_type,
1316                            rlen,
1317                            inst->mlen,
1318                            inst->header_size != 0,
1319                            simd_mode,
1320                            return_format);
1321 }
1322
1323 void
1324 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1325                                                        struct brw_reg dst,
1326                                                        struct brw_reg index,
1327                                                        struct brw_reg offset)
1328 {
1329    assert(devinfo->gen >= 7);
1330    /* Varying-offset pull constant loads are treated as a normal expression on
1331     * gen7, so the fact that it's a send message is hidden at the IR level.
1332     */
1333    assert(inst->header_size == 0);
1334    assert(!inst->mlen);
1335    assert(index.type == BRW_REGISTER_TYPE_UD);
1336
1337    uint32_t simd_mode, rlen, mlen;
1338    if (inst->exec_size == 16) {
1339       mlen = 2;
1340       rlen = 8;
1341       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1342    } else {
1343       assert(inst->exec_size == 8);
1344       mlen = 1;
1345       rlen = 4;
1346       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1347    }
1348
1349    if (index.file == BRW_IMMEDIATE_VALUE) {
1350
1351       uint32_t surf_index = index.ud;
1352
1353       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1354       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1355       brw_set_src0(p, send, offset);
1356       brw_set_sampler_message(p, send,
1357                               surf_index,
1358                               0, /* LD message ignores sampler unit */
1359                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1360                               rlen,
1361                               mlen,
1362                               false, /* no header */
1363                               simd_mode,
1364                               0);
1365
1366    } else {
1367
1368       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1369
1370       brw_push_insn_state(p);
1371       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1372
1373       /* a0.0 = surf_index & 0xff */
1374       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1375       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1376       brw_set_dest(p, insn_and, addr);
1377       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1378       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1379
1380       brw_pop_insn_state(p);
1381
1382       /* dst = send(offset, a0.0 | <descriptor>) */
1383       brw_inst *insn = brw_send_indirect_message(
1384          p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
1385          offset, addr);
1386       brw_set_sampler_message(p, insn,
1387                               0 /* surface */,
1388                               0 /* sampler */,
1389                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1390                               rlen /* rlen */,
1391                               mlen /* mlen */,
1392                               false /* header */,
1393                               simd_mode,
1394                               0);
1395    }
1396 }
1397
1398 /**
1399  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
1400  * into the flags register (f0.0).
1401  *
1402  * Used only on Gen6 and above.
1403  */
1404 void
1405 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1406 {
1407    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1408    struct brw_reg dispatch_mask;
1409
1410    if (devinfo->gen >= 6)
1411       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1412    else
1413       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1414
1415    brw_push_insn_state(p);
1416    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1417    brw_MOV(p, flags, dispatch_mask);
1418    brw_pop_insn_state(p);
1419 }
1420
1421 void
1422 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1423                                                 struct brw_reg dst,
1424                                                 struct brw_reg src,
1425                                                 struct brw_reg msg_data,
1426                                                 unsigned msg_type)
1427 {
1428    assert(inst->size_written % REG_SIZE == 0);
1429    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1430
1431    brw_pixel_interpolator_query(p,
1432          retype(dst, BRW_REGISTER_TYPE_UW),
1433          src,
1434          inst->pi_noperspective,
1435          msg_type,
1436          msg_data,
1437          inst->mlen,
1438          inst->size_written / REG_SIZE);
1439 }
1440
1441 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1442  * the ADD instruction.
1443  */
1444 void
1445 fs_generator::generate_set_sample_id(fs_inst *inst,
1446                                      struct brw_reg dst,
1447                                      struct brw_reg src0,
1448                                      struct brw_reg src1)
1449 {
1450    assert(dst.type == BRW_REGISTER_TYPE_D ||
1451           dst.type == BRW_REGISTER_TYPE_UD);
1452    assert(src0.type == BRW_REGISTER_TYPE_D ||
1453           src0.type == BRW_REGISTER_TYPE_UD);
1454
1455    struct brw_reg reg = stride(src1, 1, 4, 0);
1456    if (devinfo->gen >= 8 || inst->exec_size == 8) {
1457       brw_ADD(p, dst, src0, reg);
1458    } else if (inst->exec_size == 16) {
1459       brw_push_insn_state(p);
1460       brw_set_default_exec_size(p, BRW_EXECUTE_8);
1461       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1462       brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
1463       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1464       brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
1465       brw_pop_insn_state(p);
1466    }
1467 }
1468
1469 void
1470 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1471                                             struct brw_reg dst,
1472                                             struct brw_reg x,
1473                                             struct brw_reg y)
1474 {
1475    assert(devinfo->gen >= 7);
1476    assert(dst.type == BRW_REGISTER_TYPE_UD);
1477    assert(x.type == BRW_REGISTER_TYPE_F);
1478    assert(y.type == BRW_REGISTER_TYPE_F);
1479
1480    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1481     *
1482     *   Because this instruction does not have a 16-bit floating-point type,
1483     *   the destination data type must be Word (W).
1484     *
1485     *   The destination must be DWord-aligned and specify a horizontal stride
1486     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1487     *   each destination channel and the upper word is not modified.
1488     */
1489    struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1490
1491    /* Give each 32-bit channel of dst the form below, where "." means
1492     * unchanged.
1493     *   0x....hhhh
1494     */
1495    brw_F32TO16(p, dst_w, y);
1496
1497    /* Now the form:
1498     *   0xhhhh0000
1499     */
1500    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1501
1502    /* And, finally the form of packHalf2x16's output:
1503     *   0xhhhhllll
1504     */
1505    brw_F32TO16(p, dst_w, x);
1506 }
1507
1508 void
1509 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1510                                               struct brw_reg dst,
1511                                               struct brw_reg src)
1512 {
1513    assert(devinfo->gen >= 7);
1514    assert(dst.type == BRW_REGISTER_TYPE_F);
1515    assert(src.type == BRW_REGISTER_TYPE_UD);
1516
1517    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1518     *
1519     *   Because this instruction does not have a 16-bit floating-point type,
1520     *   the source data type must be Word (W). The destination type must be
1521     *   F (Float).
1522     */
1523    struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1524
1525    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1526     * For the Y case, we wish to access only the upper word; therefore
1527     * a 16-bit subregister offset is needed.
1528     */
1529    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1530           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1531    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1532       src_w.subnr += 2;
1533
1534    brw_F16TO32(p, dst, src_w);
1535 }
1536
1537 void
1538 fs_generator::generate_shader_time_add(fs_inst *inst,
1539                                        struct brw_reg payload,
1540                                        struct brw_reg offset,
1541                                        struct brw_reg value)
1542 {
1543    assert(devinfo->gen >= 7);
1544    brw_push_insn_state(p);
1545    brw_set_default_mask_control(p, true);
1546
1547    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1548    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1549                                           offset.type);
1550    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1551                                          value.type);
1552
1553    assert(offset.file == BRW_IMMEDIATE_VALUE);
1554    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1555       value.width = BRW_WIDTH_1;
1556       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1557       value.vstride = BRW_VERTICAL_STRIDE_0;
1558    } else {
1559       assert(value.file == BRW_IMMEDIATE_VALUE);
1560    }
1561
1562    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1563     * case, and we don't really care about squeezing every bit of performance
1564     * out of this path, so we just emit the MOVs from here.
1565     */
1566    brw_MOV(p, payload_offset, offset);
1567    brw_MOV(p, payload_value, value);
1568    brw_shader_time_add(p, payload,
1569                        prog_data->binding_table.shader_time_start);
1570    brw_pop_insn_state(p);
1571
1572    brw_mark_surface_used(prog_data,
1573                          prog_data->binding_table.shader_time_start);
1574 }
1575
1576 void
1577 fs_generator::enable_debug(const char *shader_name)
1578 {
1579    debug_flag = true;
1580    this->shader_name = shader_name;
1581 }
1582
1583 int
1584 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1585 {
1586    /* align to 64 byte boundary. */
1587    while (p->next_insn_offset % 64)
1588       brw_NOP(p);
1589
1590    this->dispatch_width = dispatch_width;
1591
1592    int start_offset = p->next_insn_offset;
1593    int spill_count = 0, fill_count = 0;
1594    int loop_count = 0;
1595
1596    struct annotation_info annotation;
1597    memset(&annotation, 0, sizeof(annotation));
1598
1599    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1600       struct brw_reg src[3], dst;
1601       unsigned int last_insn_offset = p->next_insn_offset;
1602       bool multiple_instructions_emitted = false;
1603
1604       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1605        * "Register Region Restrictions" section: for BDW, SKL:
1606        *
1607        *    "A POW/FDIV operation must not be followed by an instruction
1608        *     that requires two destination registers."
1609        *
1610        * The documentation is often lacking annotations for Atom parts,
1611        * and empirically this affects CHV as well.
1612        */
1613       if (devinfo->gen >= 8 &&
1614           p->nr_insn > 1 &&
1615           brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
1616           brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1617           inst->dst.component_size(inst->exec_size) > REG_SIZE) {
1618          brw_NOP(p);
1619          last_insn_offset = p->next_insn_offset;
1620       }
1621
1622       if (unlikely(debug_flag))
1623          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
1624
1625       /* If the instruction writes to more than one register, it needs to be
1626        * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
1627        * hardware figures out by itself what the right compression mode is,
1628        * but we still need to know whether the instruction is compressed to
1629        * set up the source register regions appropriately.
1630        *
1631        * XXX - This is wrong for instructions that write a single register but
1632        *       read more than one which should strictly speaking be treated as
1633        *       compressed.  For instructions that don't write any registers it
1634        *       relies on the destination being a null register of the correct
1635        *       type and regioning so the instruction is considered compressed
1636        *       or not accordingly.
1637        */
1638       const bool compressed =
1639            inst->dst.component_size(inst->exec_size) > REG_SIZE;
1640       brw_set_default_compression(p, compressed);
1641       brw_set_default_group(p, inst->group);
1642
1643       for (unsigned int i = 0; i < inst->sources; i++) {
1644          src[i] = brw_reg_from_fs_reg(devinfo, inst,
1645                                       &inst->src[i], compressed);
1646          /* The accumulator result appears to get used for the
1647           * conditional modifier generation.  When negating a UD
1648           * value, there is a 33rd bit generated for the sign in the
1649           * accumulator value, so now you can't check, for example,
1650           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1651           */
1652          assert(!inst->conditional_mod ||
1653                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1654                 !inst->src[i].negate);
1655       }
1656       dst = brw_reg_from_fs_reg(devinfo, inst,
1657                                 &inst->dst, compressed);
1658
1659       brw_set_default_access_mode(p, BRW_ALIGN_1);
1660       brw_set_default_predicate_control(p, inst->predicate);
1661       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1662       brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1663       brw_set_default_saturate(p, inst->saturate);
1664       brw_set_default_mask_control(p, inst->force_writemask_all);
1665       brw_set_default_acc_write_control(p, inst->writes_accumulator);
1666
1667       unsigned exec_size = inst->exec_size;
1668       if (devinfo->gen == 7 && !devinfo->is_haswell &&
1669           (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
1670          exec_size *= 2;
1671       }
1672
1673       brw_set_default_exec_size(p, cvt(exec_size) - 1);
1674
1675       assert(inst->force_writemask_all || inst->exec_size >= 4);
1676       assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
1677       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1678       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1679
1680       switch (inst->opcode) {
1681       case BRW_OPCODE_MOV:
1682          brw_MOV(p, dst, src[0]);
1683          break;
1684       case BRW_OPCODE_ADD:
1685          brw_ADD(p, dst, src[0], src[1]);
1686          break;
1687       case BRW_OPCODE_MUL:
1688          brw_MUL(p, dst, src[0], src[1]);
1689          break;
1690       case BRW_OPCODE_AVG:
1691          brw_AVG(p, dst, src[0], src[1]);
1692          break;
1693       case BRW_OPCODE_MACH:
1694          brw_MACH(p, dst, src[0], src[1]);
1695          break;
1696
1697       case BRW_OPCODE_LINE:
1698          brw_LINE(p, dst, src[0], src[1]);
1699          break;
1700
1701       case BRW_OPCODE_MAD:
1702          assert(devinfo->gen >= 6);
1703          brw_set_default_access_mode(p, BRW_ALIGN_16);
1704          brw_MAD(p, dst, src[0], src[1], src[2]);
1705          break;
1706
1707       case BRW_OPCODE_LRP:
1708          assert(devinfo->gen >= 6);
1709          brw_set_default_access_mode(p, BRW_ALIGN_16);
1710          brw_LRP(p, dst, src[0], src[1], src[2]);
1711          break;
1712
1713       case BRW_OPCODE_FRC:
1714          brw_FRC(p, dst, src[0]);
1715          break;
1716       case BRW_OPCODE_RNDD:
1717          brw_RNDD(p, dst, src[0]);
1718          break;
1719       case BRW_OPCODE_RNDE:
1720          brw_RNDE(p, dst, src[0]);
1721          break;
1722       case BRW_OPCODE_RNDZ:
1723          brw_RNDZ(p, dst, src[0]);
1724          break;
1725
1726       case BRW_OPCODE_AND:
1727          brw_AND(p, dst, src[0], src[1]);
1728          break;
1729       case BRW_OPCODE_OR:
1730          brw_OR(p, dst, src[0], src[1]);
1731          break;
1732       case BRW_OPCODE_XOR:
1733          brw_XOR(p, dst, src[0], src[1]);
1734          break;
1735       case BRW_OPCODE_NOT:
1736          brw_NOT(p, dst, src[0]);
1737          break;
1738       case BRW_OPCODE_ASR:
1739          brw_ASR(p, dst, src[0], src[1]);
1740          break;
1741       case BRW_OPCODE_SHR:
1742          brw_SHR(p, dst, src[0], src[1]);
1743          break;
1744       case BRW_OPCODE_SHL:
1745          brw_SHL(p, dst, src[0], src[1]);
1746          break;
1747       case BRW_OPCODE_F32TO16:
1748          assert(devinfo->gen >= 7);
1749          brw_F32TO16(p, dst, src[0]);
1750          break;
1751       case BRW_OPCODE_F16TO32:
1752          assert(devinfo->gen >= 7);
1753          brw_F16TO32(p, dst, src[0]);
1754          break;
1755       case BRW_OPCODE_CMP:
1756          if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
1757              dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1758             /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
1759              * implemented in the compiler is not sufficient. Overriding the
1760              * type when the destination is the null register is necessary but
1761              * not sufficient by itself.
1762              */
1763             assert(dst.nr == BRW_ARF_NULL);
1764             dst.type = BRW_REGISTER_TYPE_D;
1765          }
1766          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1767          break;
1768       case BRW_OPCODE_SEL:
1769          brw_SEL(p, dst, src[0], src[1]);
1770          break;
1771       case BRW_OPCODE_BFREV:
1772          assert(devinfo->gen >= 7);
1773          /* BFREV only supports UD type for src and dst. */
1774          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1775                       retype(src[0], BRW_REGISTER_TYPE_UD));
1776          break;
1777       case BRW_OPCODE_FBH:
1778          assert(devinfo->gen >= 7);
1779          /* FBH only supports UD type for dst. */
1780          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1781          break;
1782       case BRW_OPCODE_FBL:
1783          assert(devinfo->gen >= 7);
1784          /* FBL only supports UD type for dst. */
1785          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1786          break;
1787       case BRW_OPCODE_LZD:
1788          brw_LZD(p, dst, src[0]);
1789          break;
1790       case BRW_OPCODE_CBIT:
1791          assert(devinfo->gen >= 7);
1792          /* CBIT only supports UD type for dst. */
1793          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1794          break;
1795       case BRW_OPCODE_ADDC:
1796          assert(devinfo->gen >= 7);
1797          brw_ADDC(p, dst, src[0], src[1]);
1798          break;
1799       case BRW_OPCODE_SUBB:
1800          assert(devinfo->gen >= 7);
1801          brw_SUBB(p, dst, src[0], src[1]);
1802          break;
1803       case BRW_OPCODE_MAC:
1804          brw_MAC(p, dst, src[0], src[1]);
1805          break;
1806
1807       case BRW_OPCODE_BFE:
1808          assert(devinfo->gen >= 7);
1809          brw_set_default_access_mode(p, BRW_ALIGN_16);
1810          brw_BFE(p, dst, src[0], src[1], src[2]);
1811          break;
1812
1813       case BRW_OPCODE_BFI1:
1814          assert(devinfo->gen >= 7);
1815          brw_BFI1(p, dst, src[0], src[1]);
1816          break;
1817       case BRW_OPCODE_BFI2:
1818          assert(devinfo->gen >= 7);
1819          brw_set_default_access_mode(p, BRW_ALIGN_16);
1820          brw_BFI2(p, dst, src[0], src[1], src[2]);
1821          break;
1822
1823       case BRW_OPCODE_IF:
1824          if (inst->src[0].file != BAD_FILE) {
1825             /* The instruction has an embedded compare (only allowed on gen6) */
1826             assert(devinfo->gen == 6);
1827             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1828          } else {
1829             brw_IF(p, brw_inst_exec_size(devinfo, p->current));
1830          }
1831          break;
1832
1833       case BRW_OPCODE_ELSE:
1834          brw_ELSE(p);
1835          break;
1836       case BRW_OPCODE_ENDIF:
1837          brw_ENDIF(p);
1838          break;
1839
1840       case BRW_OPCODE_DO:
1841          brw_DO(p, brw_inst_exec_size(devinfo, p->current));
1842          break;
1843
1844       case BRW_OPCODE_BREAK:
1845          brw_BREAK(p);
1846          break;
1847       case BRW_OPCODE_CONTINUE:
1848          brw_CONT(p);
1849          break;
1850
1851       case BRW_OPCODE_WHILE:
1852          brw_WHILE(p);
1853          loop_count++;
1854          break;
1855
1856       case SHADER_OPCODE_RCP:
1857       case SHADER_OPCODE_RSQ:
1858       case SHADER_OPCODE_SQRT:
1859       case SHADER_OPCODE_EXP2:
1860       case SHADER_OPCODE_LOG2:
1861       case SHADER_OPCODE_SIN:
1862       case SHADER_OPCODE_COS:
1863          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1864          if (devinfo->gen >= 6) {
1865             assert(inst->mlen == 0);
1866             assert(devinfo->gen >= 7 || inst->exec_size == 8);
1867             gen6_math(p, dst, brw_math_function(inst->opcode),
1868                       src[0], brw_null_reg());
1869          } else {
1870             assert(inst->mlen >= 1);
1871             assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
1872             gen4_math(p, dst,
1873                       brw_math_function(inst->opcode),
1874                       inst->base_mrf, src[0],
1875                       BRW_MATH_PRECISION_FULL);
1876          }
1877          break;
1878       case SHADER_OPCODE_INT_QUOTIENT:
1879       case SHADER_OPCODE_INT_REMAINDER:
1880       case SHADER_OPCODE_POW:
1881          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1882          if (devinfo->gen >= 6) {
1883             assert(inst->mlen == 0);
1884             assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
1885                    inst->exec_size == 8);
1886             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1887          } else {
1888             assert(inst->mlen >= 1);
1889             assert(inst->exec_size == 8);
1890             gen4_math(p, dst, brw_math_function(inst->opcode),
1891                       inst->base_mrf, src[0],
1892                       BRW_MATH_PRECISION_FULL);
1893          }
1894          break;
1895       case FS_OPCODE_CINTERP:
1896          brw_MOV(p, dst, src[0]);
1897          break;
1898       case FS_OPCODE_LINTERP:
1899          generate_linterp(inst, dst, src);
1900          break;
1901       case FS_OPCODE_PIXEL_X:
1902          assert(src[0].type == BRW_REGISTER_TYPE_UW);
1903          src[0].subnr = 0 * type_sz(src[0].type);
1904          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1905          break;
1906       case FS_OPCODE_PIXEL_Y:
1907          assert(src[0].type == BRW_REGISTER_TYPE_UW);
1908          src[0].subnr = 4 * type_sz(src[0].type);
1909          brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1910          break;
1911       case FS_OPCODE_GET_BUFFER_SIZE:
1912          generate_get_buffer_size(inst, dst, src[0], src[1]);
1913          break;
1914       case SHADER_OPCODE_TEX:
1915       case FS_OPCODE_TXB:
1916       case SHADER_OPCODE_TXD:
1917       case SHADER_OPCODE_TXF:
1918       case SHADER_OPCODE_TXF_LZ:
1919       case SHADER_OPCODE_TXF_CMS:
1920       case SHADER_OPCODE_TXF_CMS_W:
1921       case SHADER_OPCODE_TXF_UMS:
1922       case SHADER_OPCODE_TXF_MCS:
1923       case SHADER_OPCODE_TXL:
1924       case SHADER_OPCODE_TXL_LZ:
1925       case SHADER_OPCODE_TXS:
1926       case SHADER_OPCODE_LOD:
1927       case SHADER_OPCODE_TG4:
1928       case SHADER_OPCODE_TG4_OFFSET:
1929       case SHADER_OPCODE_SAMPLEINFO:
1930          generate_tex(inst, dst, src[0], src[1], src[2]);
1931          break;
1932       case FS_OPCODE_DDX_COARSE:
1933       case FS_OPCODE_DDX_FINE:
1934          generate_ddx(inst->opcode, dst, src[0]);
1935          break;
1936       case FS_OPCODE_DDY_COARSE:
1937       case FS_OPCODE_DDY_FINE:
1938          generate_ddy(inst->opcode, dst, src[0]);
1939          break;
1940
1941       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1942          generate_scratch_write(inst, src[0]);
1943          spill_count++;
1944          break;
1945
1946       case SHADER_OPCODE_GEN4_SCRATCH_READ:
1947          generate_scratch_read(inst, dst);
1948          fill_count++;
1949          break;
1950
1951       case SHADER_OPCODE_GEN7_SCRATCH_READ:
1952          generate_scratch_read_gen7(inst, dst);
1953          fill_count++;
1954          break;
1955
1956       case SHADER_OPCODE_MOV_INDIRECT:
1957          generate_mov_indirect(inst, dst, src[0], src[1]);
1958          break;
1959
1960       case SHADER_OPCODE_URB_READ_SIMD8:
1961       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
1962          generate_urb_read(inst, dst, src[0]);
1963          break;
1964
1965       case SHADER_OPCODE_URB_WRITE_SIMD8:
1966       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
1967       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
1968       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
1969          generate_urb_write(inst, src[0]);
1970          break;
1971
1972       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1973          assert(inst->force_writemask_all);
1974          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1975          break;
1976
1977       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1978          assert(inst->force_writemask_all);
1979          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1980          break;
1981
1982       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
1983          generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
1984          break;
1985
1986       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1987          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1988          break;
1989
1990       case FS_OPCODE_REP_FB_WRITE:
1991       case FS_OPCODE_FB_WRITE:
1992          generate_fb_write(inst, src[0]);
1993          break;
1994
1995       case FS_OPCODE_FB_READ:
1996          generate_fb_read(inst, dst, src[0]);
1997          break;
1998
1999       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
2000          generate_mov_dispatch_to_flags(inst);
2001          break;
2002
2003       case FS_OPCODE_DISCARD_JUMP:
2004          generate_discard_jump(inst);
2005          break;
2006
2007       case SHADER_OPCODE_SHADER_TIME_ADD:
2008          generate_shader_time_add(inst, src[0], src[1], src[2]);
2009          break;
2010
2011       case SHADER_OPCODE_UNTYPED_ATOMIC:
2012          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2013          brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
2014                             inst->mlen, !inst->dst.is_null());
2015          break;
2016
2017       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
2018          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2019          brw_untyped_surface_read(p, dst, src[0], src[1],
2020                                   inst->mlen, src[2].ud);
2021          break;
2022
2023       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
2024          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2025          brw_untyped_surface_write(p, src[0], src[1],
2026                                    inst->mlen, src[2].ud);
2027          break;
2028
2029       case SHADER_OPCODE_TYPED_ATOMIC:
2030          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2031          brw_typed_atomic(p, dst, src[0], src[1],
2032                           src[2].ud, inst->mlen, !inst->dst.is_null());
2033          break;
2034
2035       case SHADER_OPCODE_TYPED_SURFACE_READ:
2036          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2037          brw_typed_surface_read(p, dst, src[0], src[1],
2038                                 inst->mlen, src[2].ud);
2039          break;
2040
2041       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
2042          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2043          brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
2044          break;
2045
2046       case SHADER_OPCODE_MEMORY_FENCE:
2047          brw_memory_fence(p, dst);
2048          break;
2049
2050       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
2051          const struct brw_reg mask =
2052             brw_stage_has_packed_dispatch(devinfo, stage,
2053                                           prog_data) ? brw_imm_ud(~0u) :
2054             stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
2055             brw_dmask_reg();
2056          brw_find_live_channel(p, dst, mask);
2057          break;
2058       }
2059
2060       case SHADER_OPCODE_BROADCAST:
2061          assert(inst->force_writemask_all);
2062          brw_broadcast(p, dst, src[0], src[1]);
2063          break;
2064
2065       case FS_OPCODE_SET_SAMPLE_ID:
2066          generate_set_sample_id(inst, dst, src[0], src[1]);
2067          break;
2068
2069       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2070           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2071           break;
2072
2073       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
2074       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
2075          generate_unpack_half_2x16_split(inst, dst, src[0]);
2076          break;
2077
2078       case FS_OPCODE_PLACEHOLDER_HALT:
2079          /* This is the place where the final HALT needs to be inserted if
2080           * we've emitted any discards.  If not, this will emit no code.
2081           */
2082          if (!patch_discard_jumps_to_fb_writes()) {
2083             if (unlikely(debug_flag)) {
2084                annotation.ann_count--;
2085             }
2086          }
2087          break;
2088
2089       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2090          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2091                                            GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2092          break;
2093
2094       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2095          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2096                                            GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2097          break;
2098
2099       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2100          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2101                                            GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2102          break;
2103
2104       case CS_OPCODE_CS_TERMINATE:
2105          generate_cs_terminate(inst, src[0]);
2106          break;
2107
2108       case SHADER_OPCODE_BARRIER:
2109          generate_barrier(inst, src[0]);
2110          break;
2111
2112       case BRW_OPCODE_DIM:
2113          assert(devinfo->is_haswell);
2114          assert(src[0].type == BRW_REGISTER_TYPE_DF);
2115          assert(dst.type == BRW_REGISTER_TYPE_DF);
2116          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2117          break;
2118
2119       default:
2120          unreachable("Unsupported opcode");
2121
2122       case SHADER_OPCODE_LOAD_PAYLOAD:
2123          unreachable("Should be lowered by lower_load_payload()");
2124       }
2125
2126       if (multiple_instructions_emitted)
2127          continue;
2128
2129       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2130          assert(p->next_insn_offset == last_insn_offset + 16 ||
2131                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2132                  "emitting more than 1 instruction");
2133
2134          brw_inst *last = &p->store[last_insn_offset / 16];
2135
2136          if (inst->conditional_mod)
2137             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2138          brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2139          brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2140       }
2141    }
2142
2143    brw_set_uip_jip(p, start_offset);
2144    annotation_finalize(&annotation, p->next_insn_offset);
2145
2146 #ifndef NDEBUG
2147    bool validated = brw_validate_instructions(p, start_offset, &annotation);
2148 #else
2149    if (unlikely(debug_flag))
2150       brw_validate_instructions(p, start_offset, &annotation);
2151 #endif
2152
2153    int before_size = p->next_insn_offset - start_offset;
2154    brw_compact_instructions(p, start_offset, annotation.ann_count,
2155                             annotation.ann);
2156    int after_size = p->next_insn_offset - start_offset;
2157
2158    if (unlikely(debug_flag)) {
2159       fprintf(stderr, "Native code for %s\n"
2160               "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
2161               " bytes (%.0f%%)\n",
2162               shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
2163               spill_count, fill_count, promoted_constants, before_size, after_size,
2164               100.0f * (before_size - after_size) / before_size);
2165
2166       dump_assembly(p->store, annotation.ann_count, annotation.ann,
2167                     p->devinfo);
2168       ralloc_free(annotation.mem_ctx);
2169    }
2170    assert(validated);
2171
2172    compiler->shader_debug_log(log_data,
2173                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2174                               "%d:%d spills:fills, Promoted %u constants, "
2175                               "compacted %d to %d bytes.",
2176                               _mesa_shader_stage_to_abbrev(stage),
2177                               dispatch_width, before_size / 16,
2178                               loop_count, cfg->cycle_count, spill_count,
2179                               fill_count, promoted_constants, before_size,
2180                               after_size);
2181
2182    return start_offset;
2183 }
2184
2185 const unsigned *
2186 fs_generator::get_assembly(unsigned int *assembly_size)
2187 {
2188    return brw_get_program(p, assembly_size);
2189 }