src/mesa/drivers/dri/i965/gen8_fs_generator.cpp

   1 /*
   2  * Copyright © 2010, 2011, 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file gen8_fs_generator.cpp
  25  *
  26  * Code generation for Gen8+ hardware.
  27  */
  28
  29 extern "C" {
  30 #include "main/macros.h"
  31 #include "brw_context.h"
  32 } /* extern "C" */
  33
  34 #include "brw_fs.h"
  35 #include "brw_cfg.h"
  36 #include "glsl/ir_print_visitor.h"
  37
  38 gen8_fs_generator::gen8_fs_generator(struct brw_context *brw,
  39                                      void *mem_ctx,
  40                                      const struct brw_wm_prog_key *key,
  41                                      struct brw_wm_prog_data *prog_data,
  42                                      struct gl_shader_program *shader_prog,
  43                                      struct gl_fragment_program *fp,
  44                                      bool dual_source_output)
  45    : gen8_generator(brw, shader_prog, fp ? &fp->Base : NULL, mem_ctx),
  46      key(key), prog_data(prog_data),
  47      fp(fp), dual_source_output(dual_source_output)
  48 {
  49 }
  50
  51 gen8_fs_generator::~gen8_fs_generator()
  52 {
  53 }
  54
  55 void
  56 gen8_fs_generator::generate_fb_write(fs_inst *ir)
  57 {
  58    /* Disable the discard condition while setting up the header. */
  59    default_state.predicate = BRW_PREDICATE_NONE;
  60    default_state.predicate_inverse = false;
  61    default_state.flag_subreg_nr = 0;
  62
  63    if (ir->header_present) {
  64       /* The GPU will use the predicate on SENDC, unless the header is present.
  65        */
  66       if (fp && fp->UsesKill) {
  67          gen8_instruction *mov =
  68             MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW),
  69                 brw_flag_reg(0, 1));
  70          gen8_set_mask_control(mov, BRW_MASK_DISABLE);
  71       }
  72
  73       gen8_instruction *mov =
  74          MOV_RAW(brw_message_reg(ir->base_mrf), brw_vec8_grf(0, 0));
  75       gen8_set_exec_size(mov, BRW_EXECUTE_16);
  76
  77       if (ir->target > 0 && key->replicate_alpha) {
  78          /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
  79          gen8_instruction *inst =
  80             OR(get_element_ud(brw_message_reg(ir->base_mrf), 0),
  81                vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
  82                brw_imm_ud(1 << 11));
  83          gen8_set_mask_control(inst, BRW_MASK_DISABLE);
  84       }
  85
  86       if (ir->target > 0) {
  87          /* Set the render target index for choosing BLEND_STATE. */
  88          MOV_RAW(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
  89                  brw_imm_ud(ir->target));
  90       }
  91    }
  92
  93    /* Set the predicate back to get the conditional write if necessary for
  94     * discards.
  95     */
  96    default_state.predicate = ir->predicate;
  97    default_state.predicate_inverse = ir->predicate_inverse;
  98    default_state.flag_subreg_nr = ir->flag_subreg;
  99
 100    gen8_instruction *inst = next_inst(BRW_OPCODE_SENDC);
 101    gen8_set_dst(brw, inst, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW));
 102    gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
 103
 104    /* Set up the "Message Specific Control" bits for the Data Port Message
 105     * Descriptor.  These are documented in the "Render Target Write" message's
 106     * "Message Descriptor" documentation (vol5c.2).
 107     */
 108    uint32_t msg_type;
 109    /* Set the Message Type */
 110    if (this->dual_source_output)
 111       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 112    else if (dispatch_width == 16)
 113       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 114    else
 115       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 116
 117    uint32_t msg_control = msg_type;
 118
 119    /* Set "Last Render Target Select" on the final FB write. */
 120    if (ir->eot)
 121       msg_control |= (1 << 4); /* Last Render Target Select */
 122
 123    uint32_t surf_index =
 124       prog_data->binding_table.render_target_start + ir->target;
 125
 126    gen8_set_dp_message(brw, inst,
 127                        GEN6_SFID_DATAPORT_RENDER_CACHE,
 128                        surf_index,
 129                        GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
 130                        msg_control,
 131                        ir->mlen,
 132                        0,
 133                        ir->header_present,
 134                        ir->eot);
 135
 136    brw_mark_surface_used(&prog_data->base, surf_index);
 137 }
 138
 139 void
 140 gen8_fs_generator::generate_linterp(fs_inst *inst,
 141                                     struct brw_reg dst,
 142                                     struct brw_reg *src)
 143 {
 144    struct brw_reg delta_x = src[0];
 145    struct brw_reg delta_y = src[1];
 146    struct brw_reg interp = src[2];
 147
 148    (void) delta_y;
 149    assert(delta_y.nr == delta_x.nr + 1);
 150    PLN(dst, interp, delta_x);
 151 }
 152
 153 void
 154 gen8_fs_generator::generate_tex(fs_inst *ir,
 155                                 struct brw_reg dst,
 156                                 struct brw_reg src,
 157                                 struct brw_reg sampler_index)
 158 {
 159    int msg_type = -1;
 160    int rlen = 4;
 161    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 162
 163    assert(src.file == BRW_GENERAL_REGISTER_FILE);
 164
 165    if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
 166       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 167
 168    switch (ir->opcode) {
 169    case SHADER_OPCODE_TEX:
 170       if (ir->shadow_compare) {
 171          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 172       } else {
 173          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 174       }
 175       break;
 176    case FS_OPCODE_TXB:
 177       if (ir->shadow_compare) {
 178          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 179       } else {
 180          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 181       }
 182       break;
 183    case SHADER_OPCODE_TXL:
 184       if (ir->shadow_compare) {
 185          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 186       } else {
 187          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 188       }
 189       break;
 190    case SHADER_OPCODE_TXS:
 191       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 192       break;
 193    case SHADER_OPCODE_TXD:
 194       if (ir->shadow_compare) {
 195          msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
 196       } else {
 197          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 198       }
 199       break;
 200    case SHADER_OPCODE_TXF:
 201       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 202       break;
 203    case SHADER_OPCODE_TXF_CMS:
 204       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
 205       break;
 206    case SHADER_OPCODE_TXF_UMS:
 207       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
 208       break;
 209    case SHADER_OPCODE_TXF_MCS:
 210       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
 211       break;
 212    case SHADER_OPCODE_LOD:
 213       msg_type = GEN5_SAMPLER_MESSAGE_LOD;
 214       break;
 215    case SHADER_OPCODE_TG4:
 216       if (ir->shadow_compare) {
 217          assert(brw->gen >= 7);
 218          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
 219       } else {
 220          assert(brw->gen >= 6);
 221          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
 222       }
 223       break;
 224    case SHADER_OPCODE_TG4_OFFSET:
 225       assert(brw->gen >= 7);
 226       if (ir->shadow_compare) {
 227          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
 228       } else {
 229          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
 230       }
 231       break;
 232    default:
 233       unreachable("not reached");
 234    }
 235    assert(msg_type != -1);
 236
 237    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 238       rlen = 8;
 239       dst = vec16(dst);
 240    }
 241
 242    assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
 243    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
 244
 245    uint32_t sampler = sampler_index.dw1.ud;
 246
 247    if (ir->header_present) {
 248       /* The send-from-GRF for SIMD16 texturing with a header has an extra
 249        * hardware register allocated to it, which we need to skip over (since
 250        * our coordinates in the payload are in the even-numbered registers,
 251        * and the header comes right before the first one.
 252        */
 253       if (dispatch_width == 16)
 254          src.nr++;
 255
 256       unsigned save_exec_size = default_state.exec_size;
 257       default_state.exec_size = BRW_EXECUTE_8;
 258
 259       MOV_RAW(src, brw_vec8_grf(0, 0));
 260
 261       if (ir->texture_offset) {
 262          /* Set the texel offset bits. */
 263          MOV_RAW(retype(brw_vec1_grf(src.nr, 2), BRW_REGISTER_TYPE_UD),
 264                  brw_imm_ud(ir->texture_offset));
 265       }
 266
 267       if (sampler >= 16) {
 268          /* The "Sampler Index" field can only store values between 0 and 15.
 269           * However, we can add an offset to the "Sampler State Pointer"
 270           * field, effectively selecting a different set of 16 samplers.
 271           *
 272           * The "Sampler State Pointer" needs to be aligned to a 32-byte
 273           * offset, and each sampler state is only 16-bytes, so we can't
 274           * exclusively use the offset - we have to use both.
 275           */
 276          const int sampler_state_size = 16; /* 16 bytes */
 277          gen8_instruction *add =
 278             ADD(get_element_ud(src, 3),
 279                 get_element_ud(brw_vec8_grf(0, 0), 3),
 280                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
 281          gen8_set_mask_control(add, BRW_MASK_DISABLE);
 282       }
 283
 284       default_state.exec_size = save_exec_size;
 285    }
 286
 287    uint32_t surf_index =
 288       prog_data->base.binding_table.texture_start + sampler;
 289
 290    gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
 291    gen8_set_dst(brw, inst, dst);
 292    gen8_set_src0(brw, inst, src);
 293    gen8_set_sampler_message(brw, inst,
 294                             surf_index,
 295                             sampler % 16,
 296                             msg_type,
 297                             rlen,
 298                             ir->mlen,
 299                             ir->header_present,
 300                             simd_mode);
 301
 302    brw_mark_surface_used(&prog_data->base, surf_index);
 303 }
 304
 305
 306 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 307  * looking like:
 308  *
 309  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 310  *
 311  * and we're trying to produce:
 312  *
 313  *           DDX                     DDY
 314  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 315  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 316  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 317  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 318  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 319  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 320  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 321  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 322  *
 323  * and add another set of two more subspans if in 16-pixel dispatch mode.
 324  *
 325  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 326  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 327  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 328  * between each other.  We could probably do it like ddx and swizzle the right
 329  * order later, but bail for now and just produce
 330  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 331  */
 332 void
 333 gen8_fs_generator::generate_ddx(fs_inst *inst,
 334                                 struct brw_reg dst,
 335                                 struct brw_reg src)
 336 {
 337    unsigned vstride, width;
 338
 339    if (key->high_quality_derivatives) {
 340       /* Produce accurate derivatives. */
 341       vstride = BRW_VERTICAL_STRIDE_2;
 342       width = BRW_WIDTH_2;
 343    } else {
 344       /* Replicate the derivative at the top-left pixel to other pixels. */
 345       vstride = BRW_VERTICAL_STRIDE_4;
 346       width = BRW_WIDTH_4;
 347    }
 348
 349    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 350                                  BRW_REGISTER_TYPE_F,
 351                                  vstride,
 352                                  width,
 353                                  BRW_HORIZONTAL_STRIDE_0,
 354                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 355    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 356                                  BRW_REGISTER_TYPE_F,
 357                                  vstride,
 358                                  width,
 359                                  BRW_HORIZONTAL_STRIDE_0,
 360                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 361    ADD(dst, src0, negate(src1));
 362 }
 363
 364 /* The negate_value boolean is used to negate the derivative computation for
 365  * FBOs, since they place the origin at the upper left instead of the lower
 366  * left.
 367  */
 368 void
 369 gen8_fs_generator::generate_ddy(fs_inst *inst,
 370                                 struct brw_reg dst,
 371                                 struct brw_reg src,
 372                                 bool negate_value)
 373 {
 374    unsigned hstride;
 375    unsigned src0_swizzle;
 376    unsigned src1_swizzle;
 377    unsigned src1_subnr;
 378
 379    if (key->high_quality_derivatives) {
 380       /* Produce accurate derivatives. */
 381       hstride = BRW_HORIZONTAL_STRIDE_1;
 382       src0_swizzle = BRW_SWIZZLE_XYXY;
 383       src1_swizzle = BRW_SWIZZLE_ZWZW;
 384       src1_subnr = 0;
 385
 386       default_state.access_mode = BRW_ALIGN_16;
 387    } else {
 388       /* Replicate the derivative at the top-left pixel to other pixels. */
 389       hstride = BRW_HORIZONTAL_STRIDE_0;
 390       src0_swizzle = BRW_SWIZZLE_XYZW;
 391       src1_swizzle = BRW_SWIZZLE_XYZW;
 392       src1_subnr = 2;
 393    }
 394
 395    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 396                                  BRW_REGISTER_TYPE_F,
 397                                  BRW_VERTICAL_STRIDE_4,
 398                                  BRW_WIDTH_4,
 399                                  hstride,
 400                                  src0_swizzle, WRITEMASK_XYZW);
 401    struct brw_reg src1 = brw_reg(src.file, src.nr, src1_subnr,
 402                                  BRW_REGISTER_TYPE_F,
 403                                  BRW_VERTICAL_STRIDE_4,
 404                                  BRW_WIDTH_4,
 405                                  hstride,
 406                                  src1_swizzle, WRITEMASK_XYZW);
 407
 408    if (negate_value)
 409       ADD(dst, src1, negate(src0));
 410    else
 411       ADD(dst, src0, negate(src1));
 412
 413    default_state.access_mode = BRW_ALIGN_1;
 414 }
 415
 416 void
 417 gen8_fs_generator::generate_scratch_write(fs_inst *ir, struct brw_reg src)
 418 {
 419    MOV(retype(brw_message_reg(ir->base_mrf + 1), BRW_REGISTER_TYPE_UD),
 420        retype(src, BRW_REGISTER_TYPE_UD));
 421
 422    struct brw_reg mrf =
 423       retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD);
 424
 425    const int num_regs = dispatch_width / 8;
 426
 427    uint32_t msg_control;
 428    if (num_regs == 1)
 429       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
 430    else
 431       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
 432
 433    /* Set up the message header.  This is g0, with g0.2 filled with
 434     * the offset.  We don't want to leave our offset around in g0 or
 435     * it'll screw up texture samples, so set it up inside the message
 436     * reg.
 437     */
 438    unsigned save_exec_size = default_state.exec_size;
 439    default_state.exec_size = BRW_EXECUTE_8;
 440
 441    MOV_RAW(mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 442    /* set message header global offset field (reg 0, element 2) */
 443    MOV_RAW(get_element_ud(mrf, 2), brw_imm_ud(ir->offset / 16));
 444
 445    struct brw_reg dst;
 446    if (dispatch_width == 16)
 447       dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 448    else
 449       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 450
 451    default_state.exec_size = BRW_EXECUTE_16;
 452
 453    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 454    gen8_set_dst(brw, send, dst);
 455    gen8_set_src0(brw, send, mrf);
 456    gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
 457                        255, /* binding table index: stateless access */
 458                        GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
 459                        msg_control,
 460                        1 + num_regs, /* mlen */
 461                        0,            /* rlen */
 462                        true,         /* header present */
 463                        false);       /* EOT */
 464
 465    default_state.exec_size = save_exec_size;
 466 }
 467
 468 void
 469 gen8_fs_generator::generate_scratch_read(fs_inst *ir, struct brw_reg dst)
 470 {
 471    struct brw_reg mrf =
 472       retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD);
 473
 474    const int num_regs = dispatch_width / 8;
 475
 476    uint32_t msg_control;
 477    if (num_regs == 1)
 478       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
 479    else
 480       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
 481
 482    unsigned save_exec_size = default_state.exec_size;
 483    default_state.exec_size = BRW_EXECUTE_8;
 484
 485    MOV_RAW(mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 486    /* set message header global offset field (reg 0, element 2) */
 487    MOV_RAW(get_element_ud(mrf, 2), brw_imm_ud(ir->offset / 16));
 488
 489    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 490    gen8_set_dst(brw, send, retype(dst, BRW_REGISTER_TYPE_UW));
 491    gen8_set_src0(brw, send, mrf);
 492    gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
 493                        255, /* binding table index: stateless access */
 494                        BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
 495                        msg_control,
 496                        1,        /* mlen */
 497                        num_regs, /* rlen */
 498                        true,     /* header present */
 499                        false);   /* EOT */
 500
 501    default_state.exec_size = save_exec_size;
 502 }
 503
 504 void
 505 gen8_fs_generator::generate_scratch_read_gen7(fs_inst *ir, struct brw_reg dst)
 506 {
 507    unsigned save_exec_size = default_state.exec_size;
 508    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 509
 510    int num_regs = dispatch_width / 8;
 511
 512    /* According to the docs, offset is "A 12-bit HWord offset into the memory
 513     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
 514     * is 32 bytes, which happens to be the size of a register.
 515     */
 516    int offset = ir->offset / REG_SIZE;
 517
 518    /* The HW requires that the header is present; this is to get the g0.5
 519     * scratch offset.
 520     */
 521    gen8_set_src0(brw, send, brw_vec8_grf(0, 0));
 522    gen8_set_dst(brw, send, retype(dst, BRW_REGISTER_TYPE_UW));
 523    gen8_set_dp_scratch_message(brw, send,
 524                                false,    /* scratch read */
 525                                false,    /* OWords */
 526                                false,    /* invalidate after read */
 527                                num_regs,
 528                                offset,
 529                                1,        /* mlen - just g0 */
 530                                num_regs, /* rlen */
 531                                true,     /* header present */
 532                                false);   /* EOT */
 533
 534    default_state.exec_size = save_exec_size;
 535 }
 536
 537 void
 538 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 539                                                        struct brw_reg dst,
 540                                                        struct brw_reg index,
 541                                                        struct brw_reg offset)
 542 {
 543    assert(inst->mlen == 0);
 544
 545    assert(index.file == BRW_IMMEDIATE_VALUE &&
 546           index.type == BRW_REGISTER_TYPE_UD);
 547    uint32_t surf_index = index.dw1.ud;
 548
 549    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
 550    /* Reference only the dword we need lest we anger validate_reg() with
 551     * reg.width > reg.execszie.
 552     */
 553    offset = brw_vec1_grf(offset.nr, 0);
 554
 555    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 556    gen8_set_mask_control(send, BRW_MASK_DISABLE);
 557
 558    /* We use the SIMD4x2 mode because we want to end up with 4 constants in
 559     * the destination loaded consecutively from the same offset (which appears
 560     * in the first component, and the rest are ignored).
 561     */
 562    dst.width = BRW_WIDTH_4;
 563    gen8_set_dst(brw, send, dst);
 564    gen8_set_src0(brw, send, offset);
 565    gen8_set_sampler_message(brw, send,
 566                             surf_index,
 567                             0, /* The LD message ignores the sampler unit. */
 568                             GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 569                             1, /* rlen */
 570                             1, /* mlen */
 571                             false, /* no header */
 572                             BRW_SAMPLER_SIMD_MODE_SIMD4X2);
 573
 574    brw_mark_surface_used(&prog_data->base, surf_index);
 575 }
 576
 577 void
 578 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst *ir,
 579                                                        struct brw_reg dst,
 580                                                        struct brw_reg index,
 581                                                        struct brw_reg offset)
 582 {
 583    /* Varying-offset pull constant loads are treated as a normal expression on
 584     * gen7, so the fact that it's a send message is hidden at the IR level.
 585     */
 586    assert(!ir->header_present);
 587    assert(!ir->mlen);
 588
 589    assert(index.file == BRW_IMMEDIATE_VALUE &&
 590           index.type == BRW_REGISTER_TYPE_UD);
 591    uint32_t surf_index = index.dw1.ud;
 592
 593    uint32_t simd_mode, rlen, mlen;
 594    if (dispatch_width == 16) {
 595       mlen = 2;
 596       rlen = 8;
 597       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 598    } else {
 599       mlen = 1;
 600       rlen = 4;
 601       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 602    }
 603
 604    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 605    gen8_set_dst(brw, send, dst);
 606    gen8_set_src0(brw, send, offset);
 607    gen8_set_sampler_message(brw, send,
 608                             surf_index,
 609                             0, /* The LD message ignore the sampler unit. */
 610                             GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 611                             rlen, /* rlen */
 612                             mlen, /* mlen */
 613                             false, /* no header */
 614                             simd_mode);
 615
 616    brw_mark_surface_used(&prog_data->base, surf_index);
 617 }
 618
 619 /**
 620  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 621  * into the flags register (f0.0).
 622  */
 623 void
 624 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst *ir)
 625 {
 626    struct brw_reg flags = brw_flag_reg(0, ir->flag_subreg);
 627    struct brw_reg dispatch_mask =
 628       retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 629
 630    gen8_instruction *mov = MOV(flags, dispatch_mask);
 631    gen8_set_mask_control(mov, BRW_MASK_DISABLE);
 632 }
 633
 634 void
 635 gen8_fs_generator::generate_discard_jump(fs_inst *ir)
 636 {
 637    /* This HALT will be patched up at FB write time to point UIP at the end of
 638     * the program, and at brw_uip_jip() JIP will be set to the end of the
 639     * current block (or the program).
 640     */
 641    discard_halt_patches.push_tail(new(mem_ctx) ip_record(nr_inst));
 642
 643    HALT();
 644 }
 645
 646 bool
 647 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
 648 {
 649    if (discard_halt_patches.is_empty())
 650       return false;
 651
 652    /* There is a somewhat strange undocumented requirement of using
 653     * HALT, according to the simulator.  If some channel has HALTed to
 654     * a particular UIP, then by the end of the program, every channel
 655     * must have HALTed to that UIP.  Furthermore, the tracking is a
 656     * stack, so you can't do the final halt of a UIP after starting
 657     * halting to a new UIP.
 658     *
 659     * Symptoms of not emitting this instruction on actual hardware
 660     * included GPU hangs and sparkly rendering on the piglit discard
 661     * tests.
 662     */
 663    gen8_instruction *last_halt = HALT();
 664    gen8_set_uip(last_halt, 16);
 665    gen8_set_jip(last_halt, 16);
 666
 667    int ip = nr_inst;
 668
 669    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
 670       gen8_instruction *patch = &store[patch_ip->ip];
 671       assert(gen8_opcode(patch) == BRW_OPCODE_HALT);
 672
 673       /* HALT takes an instruction distance from the pre-incremented IP. */
 674       gen8_set_uip(patch, (ip - patch_ip->ip) * 16);
 675    }
 676
 677    this->discard_halt_patches.make_empty();
 678    return true;
 679 }
 680
 681 /**
 682  * Sets the first dword of a vgrf for simd4x2 uniform pull constant
 683  * sampler LD messages.
 684  *
 685  * We don't want to bake it into the send message's code generation because
 686  * that means we don't get a chance to schedule the instruction.
 687  */
 688 void
 689 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst *ir,
 690                                                struct brw_reg dst,
 691                                                struct brw_reg value)
 692 {
 693    assert(value.file == BRW_IMMEDIATE_VALUE);
 694    MOV_RAW(retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
 695 }
 696
 697 /**
 698  * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
 699  * (when mask is passed as a uniform) of register mask before moving it
 700  * to register dst.
 701  */
 702 void
 703 gen8_fs_generator::generate_set_omask(fs_inst *inst,
 704                                       struct brw_reg dst,
 705                                       struct brw_reg mask)
 706 {
 707    assert(dst.type == BRW_REGISTER_TYPE_UW);
 708
 709    if (dispatch_width == 16)
 710       dst = vec16(dst);
 711
 712    if (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
 713        mask.width == BRW_WIDTH_8 &&
 714        mask.hstride == BRW_HORIZONTAL_STRIDE_1) {
 715       mask = stride(mask, 16, 8, 2);
 716    } else {
 717       assert(mask.vstride == BRW_VERTICAL_STRIDE_0 &&
 718              mask.width == BRW_WIDTH_1 &&
 719              mask.hstride == BRW_HORIZONTAL_STRIDE_0);
 720    }
 721
 722    gen8_instruction *mov = MOV(dst, retype(mask, dst.type));
 723    gen8_set_mask_control(mov, BRW_MASK_DISABLE);
 724 }
 725
 726 /**
 727  * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
 728  */
 729 void
 730 gen8_fs_generator::generate_set_sample_id(fs_inst *ir,
 731                                           struct brw_reg dst,
 732                                           struct brw_reg src0,
 733                                           struct brw_reg src1)
 734 {
 735    assert(dst.type == BRW_REGISTER_TYPE_D || dst.type == BRW_REGISTER_TYPE_UD);
 736    assert(src0.type == BRW_REGISTER_TYPE_D || src0.type == BRW_REGISTER_TYPE_UD);
 737
 738    struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
 739
 740    unsigned save_exec_size = default_state.exec_size;
 741    default_state.exec_size = BRW_EXECUTE_8;
 742
 743    gen8_instruction *add = ADD(dst, src0, reg);
 744    gen8_set_mask_control(add, BRW_MASK_DISABLE);
 745    if (dispatch_width == 16) {
 746       add = ADD(offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
 747       gen8_set_mask_control(add, BRW_MASK_DISABLE);
 748    }
 749
 750    default_state.exec_size = save_exec_size;
 751 }
 752
 753 /**
 754  * Change the register's data type from UD to HF, doubling the strides in order
 755  * to compensate for halving the data type width.
 756  */
 757 static struct brw_reg
 758 ud_reg_to_hf(struct brw_reg r)
 759 {
 760    assert(r.type == BRW_REGISTER_TYPE_UD);
 761    r.type = BRW_REGISTER_TYPE_HF;
 762
 763    /* The BRW_*_STRIDE enums are defined so that incrementing the field
 764     * doubles the real stride.
 765     */
 766    if (r.hstride != 0)
 767       ++r.hstride;
 768    if (r.vstride != 0)
 769       ++r.vstride;
 770
 771    return r;
 772 }
 773
 774 void
 775 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
 776                                                  struct brw_reg dst,
 777                                                  struct brw_reg x,
 778                                                  struct brw_reg y)
 779 {
 780    assert(dst.type == BRW_REGISTER_TYPE_UD);
 781    assert(x.type == BRW_REGISTER_TYPE_F);
 782    assert(y.type == BRW_REGISTER_TYPE_F);
 783
 784    struct brw_reg dst_hf = ud_reg_to_hf(dst);
 785
 786    /* Give each 32-bit channel of dst the form below , where "." means
 787     * unchanged.
 788     *   0x....hhhh
 789     */
 790    MOV(dst_hf, y);
 791
 792    /* Now the form:
 793     *   0xhhhh0000
 794     */
 795    SHL(dst, dst, brw_imm_ud(16u));
 796
 797    /* And, finally the form of packHalf2x16's output:
 798     *   0xhhhhllll
 799     */
 800    MOV(dst_hf, x);
 801 }
 802
 803 void
 804 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
 805                                                    struct brw_reg dst,
 806                                                    struct brw_reg src)
 807 {
 808    assert(dst.type == BRW_REGISTER_TYPE_F);
 809    assert(src.type == BRW_REGISTER_TYPE_UD);
 810
 811    struct brw_reg src_hf = ud_reg_to_hf(src);
 812
 813    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
 814     * For the Y case, we wish to access only the upper word; therefore
 815     * a 16-bit subregister offset is needed.
 816     */
 817    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
 818           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
 819    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
 820       src_hf.subnr += 2;
 821
 822    MOV(dst, src_hf);
 823 }
 824
 825 void
 826 gen8_fs_generator::generate_untyped_atomic(fs_inst *ir,
 827                                            struct brw_reg dst,
 828                                            struct brw_reg atomic_op,
 829                                            struct brw_reg surf_index)
 830 {
 831    assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
 832           atomic_op.type == BRW_REGISTER_TYPE_UD &&
 833           surf_index.file == BRW_IMMEDIATE_VALUE &&
 834           surf_index.type == BRW_REGISTER_TYPE_UD);
 835    assert((atomic_op.dw1.ud & ~0xf) == 0);
 836
 837    unsigned msg_control =
 838       atomic_op.dw1.ud | /* Atomic Operation Type: BRW_AOP_* */
 839       ((dispatch_width == 16 ? 0 : 1) << 4) | /* SIMD Mode */
 840       (1 << 5); /* Return data expected */
 841
 842    gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
 843    gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
 844    gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
 845    gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
 846                        surf_index.dw1.ud,
 847                        HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP,
 848                        msg_control,
 849                        ir->mlen,
 850                        dispatch_width / 8,
 851                        ir->header_present,
 852                        false);
 853
 854    brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
 855 }
 856
 857 void
 858 gen8_fs_generator::generate_untyped_surface_read(fs_inst *ir,
 859                                                  struct brw_reg dst,
 860                                                  struct brw_reg surf_index)
 861 {
 862    assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
 863           surf_index.type == BRW_REGISTER_TYPE_UD);
 864
 865    unsigned msg_control = 0xe | /* Enable only the R channel */
 866      ((dispatch_width == 16 ? 1 : 2) << 4); /* SIMD Mode */
 867
 868    gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
 869    gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
 870    gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
 871    gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
 872                        surf_index.dw1.ud,
 873                        HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ,
 874                        msg_control,
 875                        ir->mlen,
 876                        dispatch_width / 8,
 877                        ir->header_present,
 878                        false);
 879
 880    brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
 881 }
 882
 883 void
 884 gen8_fs_generator::generate_code(exec_list *instructions)
 885 {
 886    int start_offset = next_inst_offset;
 887
 888    struct annotation_info annotation;
 889    memset(&annotation, 0, sizeof(annotation));
 890
 891    cfg_t *cfg = NULL;
 892    if (unlikely(INTEL_DEBUG & DEBUG_WM))
 893       cfg = new(mem_ctx) cfg_t(instructions);
 894
 895    foreach_in_list(fs_inst, ir, instructions) {
 896       struct brw_reg src[3], dst;
 897
 898       if (unlikely(INTEL_DEBUG & DEBUG_WM))
 899          annotate(brw, &annotation, cfg, ir, next_inst_offset);
 900
 901       for (unsigned int i = 0; i < 3; i++) {
 902          src[i] = brw_reg_from_fs_reg(&ir->src[i]);
 903
 904          /* The accumulator result appears to get used for the
 905           * conditional modifier generation.  When negating a UD
 906           * value, there is a 33rd bit generated for the sign in the
 907           * accumulator value, so now you can't check, for example,
 908           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
 909           */
 910          assert(!ir->conditional_mod ||
 911                 ir->src[i].type != BRW_REGISTER_TYPE_UD ||
 912                 !ir->src[i].negate);
 913       }
 914       dst = brw_reg_from_fs_reg(&ir->dst);
 915
 916       default_state.conditional_mod = ir->conditional_mod;
 917       default_state.predicate = ir->predicate;
 918       default_state.predicate_inverse = ir->predicate_inverse;
 919       default_state.saturate = ir->saturate;
 920       default_state.mask_control = ir->force_writemask_all;
 921       default_state.flag_subreg_nr = ir->flag_subreg;
 922
 923       if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
 924          default_state.exec_size = BRW_EXECUTE_16;
 925       else
 926          default_state.exec_size = BRW_EXECUTE_8;
 927
 928       if (ir->force_uncompressed || dispatch_width == 8)
 929          default_state.qtr_control = GEN6_COMPRESSION_1Q;
 930       else if (ir->force_sechalf)
 931          default_state.qtr_control = GEN6_COMPRESSION_2Q;
 932       else
 933          default_state.qtr_control = GEN6_COMPRESSION_1H;
 934
 935       switch (ir->opcode) {
 936       case BRW_OPCODE_MOV:
 937          MOV(dst, src[0]);
 938          break;
 939       case BRW_OPCODE_ADD:
 940          ADD(dst, src[0], src[1]);
 941          break;
 942       case BRW_OPCODE_MUL:
 943          MUL(dst, src[0], src[1]);
 944          break;
 945       case BRW_OPCODE_MACH:
 946          MACH(dst, src[0], src[1]);
 947          break;
 948
 949       case BRW_OPCODE_MAD:
 950          default_state.access_mode = BRW_ALIGN_16;
 951          MAD(dst, src[0], src[1], src[2]);
 952          default_state.access_mode = BRW_ALIGN_1;
 953          break;
 954
 955       case BRW_OPCODE_LRP:
 956          default_state.access_mode = BRW_ALIGN_16;
 957          LRP(dst, src[0], src[1], src[2]);
 958          default_state.access_mode = BRW_ALIGN_1;
 959          break;
 960
 961
 962       case BRW_OPCODE_FRC:
 963          FRC(dst, src[0]);
 964          break;
 965       case BRW_OPCODE_RNDD:
 966          RNDD(dst, src[0]);
 967          break;
 968       case BRW_OPCODE_RNDE:
 969          RNDE(dst, src[0]);
 970          break;
 971       case BRW_OPCODE_RNDZ:
 972          RNDZ(dst, src[0]);
 973          break;
 974
 975       case BRW_OPCODE_AND:
 976          AND(dst, src[0], src[1]);
 977          break;
 978       case BRW_OPCODE_OR:
 979          OR(dst, src[0], src[1]);
 980          break;
 981       case BRW_OPCODE_XOR:
 982          XOR(dst, src[0], src[1]);
 983          break;
 984       case BRW_OPCODE_NOT:
 985          NOT(dst, src[0]);
 986          break;
 987       case BRW_OPCODE_ASR:
 988          ASR(dst, src[0], src[1]);
 989          break;
 990       case BRW_OPCODE_SHR:
 991          SHR(dst, src[0], src[1]);
 992          break;
 993       case BRW_OPCODE_SHL:
 994          SHL(dst, src[0], src[1]);
 995          break;
 996
 997       case BRW_OPCODE_F32TO16:
 998          MOV(retype(dst, BRW_REGISTER_TYPE_HF), src[0]);
 999          break;
1000       case BRW_OPCODE_F16TO32:
1001          MOV(dst, retype(src[0], BRW_REGISTER_TYPE_HF));
1002          break;
1003
1004       case BRW_OPCODE_CMP:
1005          CMP(dst, ir->conditional_mod, src[0], src[1]);
1006          break;
1007       case BRW_OPCODE_SEL:
1008          SEL(dst, src[0], src[1]);
1009          break;
1010
1011       case BRW_OPCODE_BFREV:
1012          /* BFREV only supports UD type for src and dst. */
1013          BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
1014                retype(src[0], BRW_REGISTER_TYPE_UD));
1015          break;
1016
1017       case BRW_OPCODE_FBH:
1018          /* FBH only supports UD type for dst. */
1019          FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1020          break;
1021
1022       case BRW_OPCODE_FBL:
1023          /* FBL only supports UD type for dst. */
1024          FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1025          break;
1026
1027       case BRW_OPCODE_CBIT:
1028          /* CBIT only supports UD type for dst. */
1029          CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1030          break;
1031
1032       case BRW_OPCODE_ADDC:
1033          ADDC(dst, src[0], src[1]);
1034          break;
1035
1036       case BRW_OPCODE_SUBB:
1037          SUBB(dst, src[0], src[1]);
1038          break;
1039
1040       case BRW_OPCODE_BFE:
1041          default_state.access_mode = BRW_ALIGN_16;
1042          BFE(dst, src[0], src[1], src[2]);
1043          default_state.access_mode = BRW_ALIGN_1;
1044          break;
1045
1046       case BRW_OPCODE_BFI1:
1047          BFI1(dst, src[0], src[1]);
1048          break;
1049
1050       case BRW_OPCODE_BFI2:
1051          default_state.access_mode = BRW_ALIGN_16;
1052          BFI2(dst, src[0], src[1], src[2]);
1053          default_state.access_mode = BRW_ALIGN_1;
1054          break;
1055
1056       case BRW_OPCODE_IF:
1057          IF(BRW_PREDICATE_NORMAL);
1058          break;
1059
1060       case BRW_OPCODE_ELSE:
1061          ELSE();
1062          break;
1063
1064       case BRW_OPCODE_ENDIF:
1065          ENDIF();
1066          break;
1067
1068       case BRW_OPCODE_DO:
1069          DO();
1070          break;
1071
1072       case BRW_OPCODE_BREAK:
1073          BREAK();
1074          break;
1075
1076       case BRW_OPCODE_CONTINUE:
1077          CONTINUE();
1078          break;
1079
1080       case BRW_OPCODE_WHILE:
1081          WHILE();
1082          break;
1083
1084       case SHADER_OPCODE_RCP:
1085          MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
1086          break;
1087
1088       case SHADER_OPCODE_RSQ:
1089          MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
1090          break;
1091
1092       case SHADER_OPCODE_SQRT:
1093          MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
1094          break;
1095
1096       case SHADER_OPCODE_EXP2:
1097          MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
1098          break;
1099
1100       case SHADER_OPCODE_LOG2:
1101          MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
1102          break;
1103
1104       case SHADER_OPCODE_SIN:
1105          MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
1106          break;
1107
1108       case SHADER_OPCODE_COS:
1109          MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
1110          break;
1111
1112       case SHADER_OPCODE_INT_QUOTIENT:
1113          MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
1114          break;
1115
1116       case SHADER_OPCODE_INT_REMAINDER:
1117          MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
1118          break;
1119
1120       case SHADER_OPCODE_POW:
1121          MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
1122          break;
1123
1124       case FS_OPCODE_PIXEL_X:
1125       case FS_OPCODE_PIXEL_Y:
1126          unreachable("FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1127
1128       case FS_OPCODE_CINTERP:
1129          MOV(dst, src[0]);
1130          break;
1131       case FS_OPCODE_LINTERP:
1132          generate_linterp(ir, dst, src);
1133          break;
1134       case SHADER_OPCODE_TEX:
1135       case FS_OPCODE_TXB:
1136       case SHADER_OPCODE_TXD:
1137       case SHADER_OPCODE_TXF:
1138       case SHADER_OPCODE_TXF_CMS:
1139       case SHADER_OPCODE_TXF_UMS:
1140       case SHADER_OPCODE_TXF_MCS:
1141       case SHADER_OPCODE_TXL:
1142       case SHADER_OPCODE_TXS:
1143       case SHADER_OPCODE_LOD:
1144       case SHADER_OPCODE_TG4:
1145       case SHADER_OPCODE_TG4_OFFSET:
1146          generate_tex(ir, dst, src[0], src[1]);
1147          break;
1148
1149       case FS_OPCODE_DDX:
1150          generate_ddx(ir, dst, src[0]);
1151          break;
1152       case FS_OPCODE_DDY:
1153          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1154           * guarantee that key->render_to_fbo is set).
1155           */
1156          assert(fp->UsesDFdy);
1157          generate_ddy(ir, dst, src[0], key->render_to_fbo);
1158          break;
1159
1160       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1161          generate_scratch_write(ir, src[0]);
1162          break;
1163
1164       case SHADER_OPCODE_GEN4_SCRATCH_READ:
1165          generate_scratch_read(ir, dst);
1166          break;
1167
1168       case SHADER_OPCODE_GEN7_SCRATCH_READ:
1169          generate_scratch_read_gen7(ir, dst);
1170          break;
1171
1172       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1173          generate_uniform_pull_constant_load(ir, dst, src[0], src[1]);
1174          break;
1175
1176       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1177          generate_varying_pull_constant_load(ir, dst, src[0], src[1]);
1178          break;
1179
1180       case FS_OPCODE_FB_WRITE:
1181          generate_fb_write(ir);
1182          break;
1183
1184       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1185          generate_mov_dispatch_to_flags(ir);
1186          break;
1187
1188       case FS_OPCODE_DISCARD_JUMP:
1189          generate_discard_jump(ir);
1190          break;
1191
1192       case SHADER_OPCODE_SHADER_TIME_ADD:
1193          unreachable("XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1194
1195       case SHADER_OPCODE_UNTYPED_ATOMIC:
1196          generate_untyped_atomic(ir, dst, src[0], src[1]);
1197          break;
1198
1199       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1200          generate_untyped_surface_read(ir, dst, src[0]);
1201          break;
1202
1203       case FS_OPCODE_SET_SIMD4X2_OFFSET:
1204          generate_set_simd4x2_offset(ir, dst, src[0]);
1205          break;
1206
1207       case FS_OPCODE_SET_OMASK:
1208          generate_set_omask(ir, dst, src[0]);
1209          break;
1210
1211       case FS_OPCODE_SET_SAMPLE_ID:
1212          generate_set_sample_id(ir, dst, src[0], src[1]);
1213          break;
1214
1215       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1216          generate_pack_half_2x16_split(ir, dst, src[0], src[1]);
1217          break;
1218
1219       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1220       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1221          generate_unpack_half_2x16_split(ir, dst, src[0]);
1222          break;
1223
1224       case FS_OPCODE_PLACEHOLDER_HALT:
1225          /* This is the place where the final HALT needs to be inserted if
1226           * we've emitted any discards.  If not, this will emit no code.
1227           */
1228          if (!patch_discard_jumps_to_fb_writes()) {
1229             if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1230                annotation.ann_count--;
1231             }
1232          }
1233          break;
1234
1235       default:
1236          if (ir->opcode < int(ARRAY_SIZE(opcode_descs))) {
1237             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1238                           opcode_descs[ir->opcode].name);
1239          } else {
1240             _mesa_problem(ctx, "Unsupported opcode %d in FS", ir->opcode);
1241          }
1242          abort();
1243       }
1244    }
1245
1246    patch_jump_targets();
1247    annotation_finalize(&annotation, next_inst_offset);
1248
1249    int before_size = next_inst_offset - start_offset;
1250
1251    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1252       if (shader_prog) {
1253          fprintf(stderr,
1254                  "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
1255                 shader_prog->Label ? shader_prog->Label : "unnamed",
1256                 shader_prog->Name, dispatch_width);
1257       } else if (fp) {
1258          fprintf(stderr,
1259                  "Native code for fragment program %d (SIMD%d dispatch):\n",
1260                  prog->Id, dispatch_width);
1261       } else {
1262          fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
1263                  dispatch_width);
1264       }
1265       fprintf(stderr, "SIMD%d shader: %d instructions.\n",
1266               dispatch_width, before_size / 16);
1267
1268       dump_assembly(store, annotation.ann_count, annotation.ann, brw, prog);
1269       ralloc_free(annotation.ann);
1270    }
1271 }
1272
1273 const unsigned *
1274 gen8_fs_generator::generate_assembly(exec_list *simd8_instructions,
1275                                      exec_list *simd16_instructions,
1276                                      unsigned *assembly_size)
1277 {
1278    assert(simd8_instructions || simd16_instructions);
1279
1280    if (simd8_instructions) {
1281       dispatch_width = 8;
1282       generate_code(simd8_instructions);
1283    }
1284
1285    if (simd16_instructions) {
1286       /* Align to a 64-byte boundary. */
1287       while (next_inst_offset % 64)
1288          NOP();
1289
1290       /* Save off the start of this SIMD16 program */
1291       prog_data->prog_offset_16 = next_inst_offset;
1292
1293       dispatch_width = 16;
1294       generate_code(simd16_instructions);
1295    }
1296
1297    *assembly_size = next_inst_offset;
1298    return (const unsigned *) store;
1299 }