2 * Copyright © 2010, 2011, 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file gen8_fs_generate.cpp
26 * Code generation for Gen8+ hardware.
30 #include "main/macros.h"
31 #include "brw_context.h"
36 #include "glsl/ir_print_visitor.h"
/* Constructor: forwards brw, shader_prog, the base gl_program of fp (or NULL
 * when fp is NULL), and the compile state to gen8_generator, then stores the
 * compile state, fragment program, and the dual-source-blend flag.
 * NOTE(review): this span is extraction-garbled — statements are split across
 * lines and interior source lines appear to be missing; code left untouched.
 */
38 gen8_fs_generator::gen8_fs_generator(struct brw_context
*brw
,
39 struct brw_wm_compile
*c
,
40 struct gl_shader_program
*shader_prog
,
41 struct gl_fragment_program
*fp
,
42 bool dual_source_output
)
43 : gen8_generator(brw
, shader_prog
, fp
? &fp
->Base
: NULL
, c
), c(c
), fp(fp
),
44 dual_source_output(dual_source_output
)
/* Destructor (body not visible in this garbled span). */
48 gen8_fs_generator::~gen8_fs_generator()
/* Records that binding-table slot surf_index is referenced by growing the
 * binding table's size_bytes to cover it (4 bytes per entry). */
53 gen8_fs_generator::mark_surface_used(unsigned surf_index
)
55 assert(surf_index
< BRW_MAX_SURFACES
);
57 c
->prog_data
.base
.binding_table
.size_bytes
=
58 MAX2(c
->prog_data
.base
.binding_table
.size_bytes
, (surf_index
+ 1) * 4);
/* Emits the render-target-write SENDC for FS_OPCODE_FB_WRITE: optionally
 * builds the message header (discard mask, replicated alpha bit, RT index),
 * picks the data-port message type by dispatch mode / dual-source output,
 * sets "Last Render Target Select" on the final RT, and marks the render
 * target surface as used.
 * NOTE(review): garbled span with interior lines missing (closing braces,
 * some SEND arguments); code left byte-identical.
 */
62 gen8_fs_generator::generate_fb_write(fs_inst
*ir
)
64 /* Disable the discard condition while setting up the header. */
65 default_state
.predicate
= BRW_PREDICATE_NONE
;
66 default_state
.predicate_inverse
= false;
67 default_state
.flag_subreg_nr
= 0;
69 if (ir
->header_present
) {
70 /* The GPU will use the predicate on SENDC, unless the header is present.
72 if (fp
&& fp
->UsesKill
) {
73 gen8_instruction
*mov
=
74 MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
),
76 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
79 gen8_instruction
*mov
=
80 MOV_RAW(brw_message_reg(ir
->base_mrf
), brw_vec8_grf(0, 0));
81 gen8_set_exec_size(mov
, BRW_EXECUTE_16
);
83 if (ir
->target
> 0 && c
->key
.replicate_alpha
) {
84 /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
85 OR(vec1(retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
)),
86 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
)),
91 /* Set the render target index for choosing BLEND_STATE. */
92 MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, ir
->base_mrf
, 2),
93 BRW_REGISTER_TYPE_UD
),
94 brw_imm_ud(ir
->target
));
98 /* Set the predicate back to get the conditional write if necessary for
101 default_state
.predicate
= ir
->predicate
;
102 default_state
.predicate_inverse
= ir
->predicate_inverse
;
103 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
105 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SENDC
);
106 gen8_set_dst(brw
, inst
, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
));
107 gen8_set_src0(brw
, inst
, brw_message_reg(ir
->base_mrf
));
109 /* Set up the "Message Specific Control" bits for the Data Port Message
110 * Descriptor. These are documented in the "Render Target Write" message's
111 * "Message Descriptor" documentation (vol5c.2).
114 /* Set the Message Type */
115 if (this->dual_source_output
)
116 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01
;
117 else if (dispatch_width
== 16)
118 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE
;
120 msg_type
= BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01
;
122 uint32_t msg_control
= msg_type
;
124 /* "Last Render Target Select" must be set on all writes to the last of
125 * the render targets (if using MRT), or always for a single RT scenario.
127 if ((ir
->target
== c
->key
.nr_color_regions
- 1) || !c
->key
.nr_color_regions
)
128 msg_control
|= (1 << 4); /* Last Render Target Select */
130 uint32_t surf_index
=
131 c
->prog_data
.binding_table
.render_target_start
+ ir
->target
;
133 gen8_set_dp_message(brw
, inst
,
134 GEN6_SFID_DATAPORT_RENDER_CACHE
,
136 GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
,
143 mark_surface_used(surf_index
);
/* Emits a PLN instruction for linear interpolation: interpolates the
 * per-attribute setup data (src[2]) against the barycentric deltas in
 * src[0]/src[1].  PLN requires delta_y to live in the register immediately
 * after delta_x, which the assert checks. */
147 gen8_fs_generator::generate_linterp(fs_inst
*inst
,
151 struct brw_reg delta_x
= src
[0];
152 struct brw_reg delta_y
= src
[1];
153 struct brw_reg interp
= src
[2];
156 assert(delta_y
.nr
== delta_x
.nr
+ 1);
157 PLN(dst
, interp
, delta_x
);
/* Emits the sampler SEND for a texturing operation: selects the sampler
 * message type from ir->opcode and shadow_compare, chooses SIMD8/SIMD16
 * mode, optionally builds a message header (for texel offsets or sampler
 * indices >= 16), and marks the texture's binding-table surface as used.
 * NOTE(review): garbled span — braces, `break`s, and some SEND arguments
 * from the original are missing; code left byte-identical.
 */
161 gen8_fs_generator::generate_tex(fs_inst
*ir
,
167 uint32_t simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
169 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
);
171 if (dispatch_width
== 16 && !ir
->force_uncompressed
&& !ir
->force_sechalf
)
172 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
174 switch (ir
->opcode
) {
175 case SHADER_OPCODE_TEX
:
176 if (ir
->shadow_compare
) {
177 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE
;
179 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE
;
183 if (ir
->shadow_compare
) {
184 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE
;
186 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS
;
189 case SHADER_OPCODE_TXL
:
190 if (ir
->shadow_compare
) {
191 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
;
193 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LOD
;
196 case SHADER_OPCODE_TXS
:
197 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO
;
199 case SHADER_OPCODE_TXD
:
200 if (ir
->shadow_compare
) {
201 msg_type
= HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE
;
203 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS
;
206 case SHADER_OPCODE_TXF
:
207 msg_type
= GEN5_SAMPLER_MESSAGE_SAMPLE_LD
;
209 case SHADER_OPCODE_TXF_CMS
:
210 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
;
212 case SHADER_OPCODE_TXF_UMS
:
213 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS
;
215 case SHADER_OPCODE_TXF_MCS
:
216 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS
;
218 case SHADER_OPCODE_LOD
:
219 msg_type
= GEN5_SAMPLER_MESSAGE_LOD
;
221 case SHADER_OPCODE_TG4
:
222 if (ir
->shadow_compare
) {
223 assert(brw
->gen
>= 7);
224 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
;
226 assert(brw
->gen
>= 6);
227 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4
;
230 case SHADER_OPCODE_TG4_OFFSET
:
231 assert(brw
->gen
>= 7);
232 if (ir
->shadow_compare
) {
233 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
;
235 msg_type
= GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO
;
239 assert(!"not reached");
242 assert(msg_type
!= -1);
244 if (simd_mode
== BRW_SAMPLER_SIMD_MODE_SIMD16
) {
249 if (ir
->header_present
) {
250 /* The send-from-GRF for SIMD16 texturing with a header has an extra
251 * hardware register allocated to it, which we need to skip over (since
252 * our coordinates in the payload are in the even-numbered registers,
253 * and the header comes right before the first one.
255 if (dispatch_width
== 16)
258 unsigned save_exec_size
= default_state
.exec_size
;
259 default_state
.exec_size
= BRW_EXECUTE_8
;
261 MOV_RAW(src
, brw_vec8_grf(0, 0));
263 if (ir
->texture_offset
) {
264 /* Set the texel offset bits. */
265 MOV_RAW(retype(brw_vec1_grf(src
.nr
, 2), BRW_REGISTER_TYPE_UD
),
266 brw_imm_ud(ir
->texture_offset
));
269 if (ir
->sampler
>= 16) {
270 /* The "Sampler Index" field can only store values between 0 and 15.
271 * However, we can add an offset to the "Sampler State Pointer"
272 * field, effectively selecting a different set of 16 samplers.
274 * The "Sampler State Pointer" needs to be aligned to a 32-byte
275 * offset, and each sampler state is only 16-bytes, so we can't
276 * exclusively use the offset - we have to use both.
278 gen8_instruction
*add
=
279 ADD(get_element_ud(src
, 3),
280 get_element_ud(brw_vec8_grf(0, 0), 3),
281 brw_imm_ud(16 * (ir
->sampler
/ 16) *
282 sizeof(gen7_sampler_state
)));
283 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
286 default_state
.exec_size
= save_exec_size
;
289 uint32_t surf_index
=
290 c
->prog_data
.base
.binding_table
.texture_start
+ ir
->sampler
;
292 gen8_instruction
*inst
= next_inst(BRW_OPCODE_SEND
);
293 gen8_set_dst(brw
, inst
, dst
);
294 gen8_set_src0(brw
, inst
, src
);
295 gen8_set_sampler_message(brw
, inst
,
304 mark_surface_used(surf_index
);
308 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
311 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
313 * and we're trying to produce:
316 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
317 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
318 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
319 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
320 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
321 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
322 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
323 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
325 * and add another set of two more subspans if in 16-pixel dispatch mode.
327 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
328 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
329 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
330 * between each other. We could probably do it like ddx and swizzle the right
331 * order later, but bail for now and just produce
332 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
/* Emits the ADD computing a horizontal derivative (dFdx): subtracts a
 * stride-view of the source shifted by one subregister.  vstride is chosen
 * by c->key.high_quality_derivatives (per-pair vs. replicated per subspan).
 * NOTE(review): garbled span with interior lines missing; code untouched. */
335 gen8_fs_generator::generate_ddx(fs_inst
*inst
,
339 unsigned vstride
, width
;
341 if (c
->key
.high_quality_derivatives
) {
342 /* Produce accurate derivatives. */
343 vstride
= BRW_VERTICAL_STRIDE_2
;
346 /* Replicate the derivative at the top-left pixel to other pixels. */
347 vstride
= BRW_VERTICAL_STRIDE_4
;
351 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 1,
355 BRW_HORIZONTAL_STRIDE_0
,
356 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
357 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, 0,
361 BRW_HORIZONTAL_STRIDE_0
,
362 BRW_SWIZZLE_XYZW
, WRITEMASK_XYZW
);
363 ADD(dst
, src0
, negate(src1
));
366 /* The negate_value boolean is used to negate the derivative computation for
367 * FBOs, since they place the origin at the upper left instead of the lower
/* Emits the ADD computing a vertical derivative (dFdy).  High-quality mode
 * uses Align16 swizzles (XYXY vs. ZWZW) to subtract within each subspan;
 * otherwise the top-left derivative is replicated.  The operand order of the
 * final ADD is flipped based on the (not visible here) negate_value flag.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
371 gen8_fs_generator::generate_ddy(fs_inst
*inst
,
377 unsigned src0_swizzle
;
378 unsigned src1_swizzle
;
381 if (c
->key
.high_quality_derivatives
) {
382 /* Produce accurate derivatives. */
383 hstride
= BRW_HORIZONTAL_STRIDE_1
;
384 src0_swizzle
= BRW_SWIZZLE_XYXY
;
385 src1_swizzle
= BRW_SWIZZLE_ZWZW
;
388 default_state
.access_mode
= BRW_ALIGN_16
;
390 /* Replicate the derivative at the top-left pixel to other pixels. */
391 hstride
= BRW_HORIZONTAL_STRIDE_0
;
392 src0_swizzle
= BRW_SWIZZLE_XYZW
;
393 src1_swizzle
= BRW_SWIZZLE_XYZW
;
397 struct brw_reg src0
= brw_reg(src
.file
, src
.nr
, 0,
399 BRW_VERTICAL_STRIDE_4
,
402 src0_swizzle
, WRITEMASK_XYZW
);
403 struct brw_reg src1
= brw_reg(src
.file
, src
.nr
, src1_subnr
,
405 BRW_VERTICAL_STRIDE_4
,
408 src1_swizzle
, WRITEMASK_XYZW
);
411 ADD(dst
, src1
, negate(src0
));
413 ADD(dst
, src0
, negate(src1
));
415 default_state
.access_mode
= BRW_ALIGN_1
;
/* Emits an OWORD block write to scratch space (register spilling): copies
 * src into the message payload, builds a header in the MRF with the scratch
 * offset in g0.2, and SENDs to the data cache with binding table index 255
 * (stateless).  Message size scales with dispatch width (2 or 4 OWORDs).
 * NOTE(review): garbled span with interior lines missing; code untouched. */
419 gen8_fs_generator::generate_scratch_write(fs_inst
*ir
, struct brw_reg src
)
421 MOV(retype(brw_message_reg(ir
->base_mrf
+ 1), BRW_REGISTER_TYPE_UD
),
422 retype(src
, BRW_REGISTER_TYPE_UD
));
425 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
427 const int num_regs
= dispatch_width
/ 8;
429 uint32_t msg_control
;
431 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
433 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
435 /* Set up the message header. This is g0, with g0.2 filled with
436 * the offset. We don't want to leave our offset around in g0 or
437 * it'll screw up texture samples, so set it up inside the message
440 unsigned save_exec_size
= default_state
.exec_size
;
441 default_state
.exec_size
= BRW_EXECUTE_8
;
443 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
444 /* set message header global offset field (reg 0, element 2) */
445 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
448 if (dispatch_width
== 16)
449 dst
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
451 dst
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
453 default_state
.exec_size
= BRW_EXECUTE_16
;
455 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
456 gen8_set_dst(brw
, send
, dst
);
457 gen8_set_src0(brw
, send
, mrf
);
458 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
459 255, /* binding table index: stateless access */
460 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
,
462 1 + num_regs
, /* mlen */
464 true, /* header present */
467 default_state
.exec_size
= save_exec_size
;
/* Emits an OWORD block read from scratch space (register unspilling):
 * mirror image of generate_scratch_write — same header setup (offset in
 * g0.2), same stateless binding table index 255, reading into dst.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
471 gen8_fs_generator::generate_scratch_read(fs_inst
*ir
, struct brw_reg dst
)
474 retype(brw_message_reg(ir
->base_mrf
), BRW_REGISTER_TYPE_UD
);
476 const int num_regs
= dispatch_width
/ 8;
478 uint32_t msg_control
;
480 msg_control
= BRW_DATAPORT_OWORD_BLOCK_2_OWORDS
;
482 msg_control
= BRW_DATAPORT_OWORD_BLOCK_4_OWORDS
;
484 unsigned save_exec_size
= default_state
.exec_size
;
485 default_state
.exec_size
= BRW_EXECUTE_8
;
487 MOV_RAW(mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
488 /* set message header global offset field (reg 0, element 2) */
489 MOV_RAW(get_element_ud(mrf
, 2), brw_imm_ud(ir
->offset
/ 16));
491 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
492 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
493 gen8_set_src0(brw
, send
, mrf
);
494 gen8_set_dp_message(brw
, send
, GEN7_SFID_DATAPORT_DATA_CACHE
,
495 255, /* binding table index: stateless access */
496 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
500 true, /* header present */
503 default_state
.exec_size
= save_exec_size
;
/* Emits a Gen7-style scratch block read using the dedicated scratch message:
 * no MRF header construction is needed — g0 is sent directly as the header
 * and the HWord (register-sized) offset goes in the message descriptor.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
507 gen8_fs_generator::generate_scratch_read_gen7(fs_inst
*ir
, struct brw_reg dst
)
509 unsigned save_exec_size
= default_state
.exec_size
;
510 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
512 int num_regs
= dispatch_width
/ 8;
514 /* According to the docs, offset is "A 12-bit HWord offset into the memory
515 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
516 * is 32 bytes, which happens to be the size of a register.
518 int offset
= ir
->offset
/ REG_SIZE
;
520 /* The HW requires that the header is present; this is to get the g0.5
523 gen8_set_src0(brw
, send
, brw_vec8_grf(0, 0));
524 gen8_set_dst(brw
, send
, retype(dst
, BRW_REGISTER_TYPE_UW
));
525 gen8_set_dp_scratch_message(brw
, send
,
526 false, /* scratch read */
528 false, /* invalidate after read */
531 1, /* mlen - just g0 */
533 true, /* header present */
536 default_state
.exec_size
= save_exec_size
;
/* Loads 4 consecutive uniform constants via a SIMD4x2 sampler LD message:
 * the UBO surface index comes from the immediate `index`, the dword offset
 * from a single GRF element of `offset`.  WE_all (mask disable) because the
 * result is uniform across channels.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
540 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst
*inst
,
542 struct brw_reg index
,
543 struct brw_reg offset
)
545 assert(inst
->mlen
== 0);
547 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
548 index
.type
== BRW_REGISTER_TYPE_UD
);
549 uint32_t surf_index
= index
.dw1
.ud
;
551 assert(offset
.file
== BRW_GENERAL_REGISTER_FILE
);
552 /* Reference only the dword we need lest we anger validate_reg() with
553 * reg.width > reg.execszie.
555 offset
= brw_vec1_grf(offset
.nr
, 0);
557 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
558 gen8_set_mask_control(send
, BRW_MASK_DISABLE
);
560 /* We use the SIMD4x2 mode because we want to end up with 4 constants in
561 * the destination loaded consecutively from the same offset (which appears
562 * in the first component, and the rest are ignored).
564 dst
.width
= BRW_WIDTH_4
;
565 gen8_set_dst(brw
, send
, dst
);
566 gen8_set_src0(brw
, send
, offset
);
567 gen8_set_sampler_message(brw
, send
,
569 0, /* The LD message ignores the sampler unit. */
570 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
573 false, /* no header */
574 BRW_SAMPLER_SIMD_MODE_SIMD4X2
);
576 mark_surface_used(surf_index
);
/* Loads constants at per-channel offsets via a headerless sampler LD
 * message (SIMD8 or SIMD16 by dispatch width); the surface index comes
 * from the immediate `index`.
 * NOTE(review): garbled span — rlen/mlen assignments and SEND arguments
 * are missing from this extraction; code left byte-identical. */
580 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst
*ir
,
582 struct brw_reg index
,
583 struct brw_reg offset
)
585 /* Varying-offset pull constant loads are treated as a normal expression on
586 * gen7, so the fact that it's a send message is hidden at the IR level.
588 assert(!ir
->header_present
);
591 assert(index
.file
== BRW_IMMEDIATE_VALUE
&&
592 index
.type
== BRW_REGISTER_TYPE_UD
);
593 uint32_t surf_index
= index
.dw1
.ud
;
595 uint32_t simd_mode
, rlen
, mlen
;
596 if (dispatch_width
== 16) {
599 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD16
;
603 simd_mode
= BRW_SAMPLER_SIMD_MODE_SIMD8
;
606 gen8_instruction
*send
= next_inst(BRW_OPCODE_SEND
);
607 gen8_set_dst(brw
, send
, dst
);
608 gen8_set_src0(brw
, send
, offset
);
609 gen8_set_sampler_message(brw
, send
,
611 0, /* The LD message ignore the sampler unit. */
612 GEN5_SAMPLER_MESSAGE_SAMPLE_LD
,
615 false, /* no header */
618 mark_surface_used(surf_index
);
622 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
623 * into the flags register (f0.0).
/* Implementation: a WE_all MOV from the UW dispatch-mask element g1.7 into
 * the flag register selected by ir->flag_subreg. */
626 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst
*ir
)
628 struct brw_reg flags
= brw_flag_reg(0, ir
->flag_subreg
);
629 struct brw_reg dispatch_mask
=
630 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW
);
632 gen8_instruction
*mov
= MOV(flags
, dispatch_mask
);
633 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
/* Records the instruction index of a discard HALT so its UIP can be patched
 * later by patch_discard_jumps_to_fb_writes().
 * NOTE(review): the HALT() emission itself is not visible in this garbled
 * span — only the patch-list bookkeeping survives extraction. */
637 gen8_fs_generator::generate_discard_jump(fs_inst
*ir
)
639 /* This HALT will be patched up at FB write time to point UIP at the end of
640 * the program, and at brw_uip_jip() JIP will be set to the end of the
641 * current block (or the program).
643 discard_halt_patches
.push_tail(new(mem_ctx
) ip_record(nr_inst
));
/* Emits a final rendezvous HALT and back-patches the UIP of every recorded
 * discard HALT to point at it (distances are in bytes: 16 per instruction),
 * then clears the patch list.  No-op when no discards were emitted. */
649 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
651 if (discard_halt_patches
.is_empty())
654 /* There is a somewhat strange undocumented requirement of using
655 * HALT, according to the simulator. If some channel has HALTed to
656 * a particular UIP, then by the end of the program, every channel
657 * must have HALTed to that UIP. Furthermore, the tracking is a
658 * stack, so you can't do the final halt of a UIP after starting
659 * halting to a new UIP.
661 * Symptoms of not emitting this instruction on actual hardware
662 * included GPU hangs and sparkly rendering on the piglit discard
665 gen8_instruction
*last_halt
= HALT();
666 gen8_set_uip(last_halt
, 16);
667 gen8_set_jip(last_halt
, 16);
671 foreach_list(node
, &discard_halt_patches
) {
672 ip_record
*patch_ip
= (ip_record
*) node
;
673 gen8_instruction
*patch
= &store
[patch_ip
->ip
];
674 assert(gen8_opcode(patch
) == BRW_OPCODE_HALT
);
676 /* HALT takes an instruction distance from the pre-incremented IP. */
677 gen8_set_uip(patch
, (ip
- patch_ip
->ip
) * 16);
680 this->discard_halt_patches
.make_empty();
684 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
685 * sampler LD messages.
687 * We don't want to bake it into the send message's code generation because
688 * that means we don't get a chance to schedule the instruction.
/* Implementation: a single raw MOV of the immediate `value` into element 0
 * of the destination VGRF. */
691 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst
*ir
,
693 struct brw_reg value
)
695 assert(value
.file
== BRW_IMMEDIATE_VALUE
);
696 MOV_RAW(retype(brw_vec1_reg(dst
.file
, dst
.nr
, 0), value
.type
), value
);
700 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
701 * (when mask is passed as a uniform) of register mask before moving it
/* Implementation: re-strides the sample-mask source so each UW destination
 * channel receives the right mask word, then moves it with a WE_all SIMD8
 * MOV (exec size restored afterwards). */
705 gen8_fs_generator::generate_set_omask(fs_inst
*inst
,
709 assert(dst
.type
== BRW_REGISTER_TYPE_UW
);
711 if (dispatch_width
== 16)
714 if (mask
.vstride
== BRW_VERTICAL_STRIDE_8
&&
715 mask
.width
== BRW_WIDTH_8
&&
716 mask
.hstride
== BRW_HORIZONTAL_STRIDE_1
) {
717 mask
= stride(mask
, 16, 8, 2);
719 assert(mask
.vstride
== BRW_VERTICAL_STRIDE_0
&&
720 mask
.width
== BRW_WIDTH_1
&&
721 mask
.hstride
== BRW_HORIZONTAL_STRIDE_0
);
724 unsigned save_exec_size
= default_state
.exec_size
;
725 default_state
.exec_size
= BRW_EXECUTE_8
;
727 gen8_instruction
*mov
= MOV(dst
, retype(mask
, dst
.type
));
728 gen8_set_mask_control(mov
, BRW_MASK_DISABLE
);
730 default_state
.exec_size
= save_exec_size
;
734 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
/* Implementation: adds the re-strided UW sample-id vector (src1 viewed as
 * vstride=1/width=4/hstride=0) to src0 with WE_all SIMD8 ADDs; SIMD16 needs
 * a second ADD for the upper half, offset by 2 UW elements. */
737 gen8_fs_generator::generate_set_sample_id(fs_inst
*ir
,
742 assert(dst
.type
== BRW_REGISTER_TYPE_D
|| dst
.type
== BRW_REGISTER_TYPE_UD
);
743 assert(src0
.type
== BRW_REGISTER_TYPE_D
|| src0
.type
== BRW_REGISTER_TYPE_UD
);
745 struct brw_reg reg
= retype(stride(src1
, 1, 4, 0), BRW_REGISTER_TYPE_UW
);
747 unsigned save_exec_size
= default_state
.exec_size
;
748 default_state
.exec_size
= BRW_EXECUTE_8
;
750 gen8_instruction
*add
= ADD(dst
, src0
, reg
);
751 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
752 if (dispatch_width
== 16) {
753 add
= ADD(offset(dst
, 1), offset(src0
, 1), suboffset(reg
, 2));
754 gen8_set_mask_control(add
, BRW_MASK_DISABLE
);
757 default_state
.exec_size
= save_exec_size
;
761 * Change the register's data type from UD to HF, doubling the strides in order
762 * to compensate for halving the data type width.
764 static struct brw_reg
765 ud_reg_to_hf(struct brw_reg r
)
767 assert(r
.type
== BRW_REGISTER_TYPE_UD
);
768 r
.type
= BRW_REGISTER_TYPE_HF
;
770 /* The BRW_*_STRIDE enums are defined so that incrementing the field
771 * doubles the real stride.
/* NOTE(review): the stride-increment statements and `return r;` are missing
 * from this garbled extraction (original lines 772-780). */
/* Implements packHalf2x16: converts the float inputs to HF views of the UD
 * destination and assembles 0xhhhhllll per channel (y shifted into the
 * upper word via SHL by 16).
 * NOTE(review): the MOVs into dst_hf are missing from this garbled
 * extraction; code left byte-identical. */
782 gen8_fs_generator::generate_pack_half_2x16_split(fs_inst
*inst
,
787 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
788 assert(x
.type
== BRW_REGISTER_TYPE_F
);
789 assert(y
.type
== BRW_REGISTER_TYPE_F
);
791 struct brw_reg dst_hf
= ud_reg_to_hf(dst
);
793 /* Give each 32-bit channel of dst the form below , where "." means
802 SHL(dst
, dst
, brw_imm_ud(16u));
804 /* And, finally the form of packHalf2x16's output:
804 /* And, finally the form of packHalf2x16's output:
811 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst
*inst
,
815 assert(dst
.type
== BRW_REGISTER_TYPE_F
);
816 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
818 struct brw_reg src_hf
= ud_reg_to_hf(src
);
820 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
821 * For the Y case, we wish to access only the upper word; therefore
822 * a 16-bit subregister offset is needed.
824 assert(inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
||
825 inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
);
826 if (inst
->opcode
== FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
)
/* Main emission loop: walks the fs_inst list, converting each IR instruction
 * into Gen8 machine code.  Per instruction it (a) prints CFG/IR annotations
 * under INTEL_DEBUG=wm, (b) converts src/dst fs_regs to brw_regs, (c) loads
 * default_state (predication, saturate, exec size, quarter control) from the
 * instruction, and (d) dispatches on ir->opcode to either a direct ALU/math
 * emission or one of the generate_* helpers above.  Finishes by patching
 * jump targets.
 * NOTE(review): heavily garbled span — `break`s, braces, several emission
 * lines, and some case labels are missing; code left byte-identical. */
833 gen8_fs_generator::generate_code(exec_list
*instructions
)
835 int last_native_inst_offset
= next_inst_offset
;
836 const char *last_annotation_string
= NULL
;
837 const void *last_annotation_ir
= NULL
;
839 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
842 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
843 shader_prog
->Label
? shader_prog
->Label
: "unnamed",
844 shader_prog
->Name
, dispatch_width
);
847 "Native code for fragment program %d (SIMD%d dispatch):\n",
848 prog
->Id
, dispatch_width
);
850 fprintf(stderr
, "Native code for blorp program (SIMD%d dispatch):\n",
856 if (unlikely(INTEL_DEBUG
& DEBUG_WM
))
857 cfg
= new(mem_ctx
) cfg_t(instructions
);
859 foreach_list(node
, instructions
) {
860 fs_inst
*ir
= (fs_inst
*) node
;
861 struct brw_reg src
[3], dst
;
863 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
864 foreach_list(node
, &cfg
->block_list
) {
865 bblock_link
*link
= (bblock_link
*)node
;
866 bblock_t
*block
= link
->block
;
868 if (block
->start
== ir
) {
869 fprintf(stderr
, "   START B%d", block
->block_num
);
870 foreach_list(predecessor_node
, &block
->parents
) {
871 bblock_link
*predecessor_link
=
872 (bblock_link
*)predecessor_node
;
873 bblock_t
*predecessor_block
= predecessor_link
->block
;
874 fprintf(stderr
, " <-B%d", predecessor_block
->block_num
);
876 fprintf(stderr
, "\n");
880 if (last_annotation_ir
!= ir
->ir
) {
881 last_annotation_ir
= ir
->ir
;
882 if (last_annotation_ir
) {
883 fprintf(stderr
, "   ");
885 ((ir_instruction
*) ir
->ir
)->fprint(stderr
);
887 const prog_instruction
*fpi
;
888 fpi
= (const prog_instruction
*) ir
->ir
;
889 fprintf(stderr
, "%d: ", (int)(fpi
- prog
->Instructions
));
890 _mesa_fprint_instruction_opt(stderr
,
892 0, PROG_PRINT_DEBUG
, NULL
);
894 fprintf(stderr
, "\n");
897 if (last_annotation_string
!= ir
->annotation
) {
898 last_annotation_string
= ir
->annotation
;
899 if (last_annotation_string
)
900 fprintf(stderr
, "   %s\n", last_annotation_string
);
904 for (unsigned int i
= 0; i
< 3; i
++) {
905 src
[i
] = brw_reg_from_fs_reg(&ir
->src
[i
]);
907 /* The accumulator result appears to get used for the
908 * conditional modifier generation. When negating a UD
909 * value, there is a 33rd bit generated for the sign in the
910 * accumulator value, so now you can't check, for example,
911 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
913 assert(!ir
->conditional_mod
||
914 ir
->src
[i
].type
!= BRW_REGISTER_TYPE_UD
||
917 dst
= brw_reg_from_fs_reg(&ir
->dst
);
919 default_state
.conditional_mod
= ir
->conditional_mod
;
920 default_state
.predicate
= ir
->predicate
;
921 default_state
.predicate_inverse
= ir
->predicate_inverse
;
922 default_state
.saturate
= ir
->saturate
;
923 default_state
.mask_control
= ir
->force_writemask_all
;
924 default_state
.flag_subreg_nr
= ir
->flag_subreg
;
926 if (dispatch_width
== 16 && !ir
->force_uncompressed
)
927 default_state
.exec_size
= BRW_EXECUTE_16
;
929 default_state
.exec_size
= BRW_EXECUTE_8
;
931 if (ir
->force_uncompressed
|| dispatch_width
== 8)
932 default_state
.qtr_control
= GEN6_COMPRESSION_1Q
;
933 else if (ir
->force_sechalf
)
934 default_state
.qtr_control
= GEN6_COMPRESSION_2Q
;
936 default_state
.qtr_control
= GEN6_COMPRESSION_1H
;
938 switch (ir
->opcode
) {
943 ADD(dst
, src
[0], src
[1]);
946 MUL(dst
, src
[0], src
[1]);
948 case BRW_OPCODE_MACH
:
949 MACH(dst
, src
[0], src
[1]);
953 default_state
.access_mode
= BRW_ALIGN_16
;
954 MAD(dst
, src
[0], src
[1], src
[2]);
955 default_state
.access_mode
= BRW_ALIGN_1
;
959 default_state
.access_mode
= BRW_ALIGN_16
;
960 LRP(dst
, src
[0], src
[1], src
[2]);
961 default_state
.access_mode
= BRW_ALIGN_1
;
968 case BRW_OPCODE_RNDD
:
971 case BRW_OPCODE_RNDE
:
974 case BRW_OPCODE_RNDZ
:
979 AND(dst
, src
[0], src
[1]);
982 OR(dst
, src
[0], src
[1]);
985 XOR(dst
, src
[0], src
[1]);
991 ASR(dst
, src
[0], src
[1]);
994 SHR(dst
, src
[0], src
[1]);
997 SHL(dst
, src
[0], src
[1]);
1000 case BRW_OPCODE_F32TO16
:
1001 MOV(retype(dst
, BRW_REGISTER_TYPE_HF
), src
[0]);
1003 case BRW_OPCODE_F16TO32
:
1004 MOV(dst
, retype(src
[0], BRW_REGISTER_TYPE_HF
));
1007 case BRW_OPCODE_CMP
:
1008 CMP(dst
, ir
->conditional_mod
, src
[0], src
[1]);
1010 case BRW_OPCODE_SEL
:
1011 SEL(dst
, src
[0], src
[1]);
1014 case BRW_OPCODE_BFREV
:
1015 /* BFREV only supports UD type for src and dst. */
1016 BFREV(retype(dst
, BRW_REGISTER_TYPE_UD
),
1017 retype(src
[0], BRW_REGISTER_TYPE_UD
));
1020 case BRW_OPCODE_FBH
:
1021 /* FBH only supports UD type for dst. */
1022 FBH(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1025 case BRW_OPCODE_FBL
:
1026 /* FBL only supports UD type for dst. */
1027 FBL(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1030 case BRW_OPCODE_CBIT
:
1031 /* CBIT only supports UD type for dst. */
1032 CBIT(retype(dst
, BRW_REGISTER_TYPE_UD
), src
[0]);
1035 case BRW_OPCODE_ADDC
:
1036 ADDC(dst
, src
[0], src
[1]);
1039 case BRW_OPCODE_SUBB
:
1040 SUBB(dst
, src
[0], src
[1]);
1043 case BRW_OPCODE_BFE
:
1044 default_state
.access_mode
= BRW_ALIGN_16
;
1045 BFE(dst
, src
[0], src
[1], src
[2]);
1046 default_state
.access_mode
= BRW_ALIGN_1
;
1049 case BRW_OPCODE_BFI1
:
1050 BFI1(dst
, src
[0], src
[1]);
1053 case BRW_OPCODE_BFI2
:
1054 default_state
.access_mode
= BRW_ALIGN_16
;
1055 BFI2(dst
, src
[0], src
[1], src
[2]);
1056 default_state
.access_mode
= BRW_ALIGN_1
;
1060 IF(BRW_PREDICATE_NORMAL
);
1063 case BRW_OPCODE_ELSE
:
1067 case BRW_OPCODE_ENDIF
:
1075 case BRW_OPCODE_BREAK
:
1079 case BRW_OPCODE_CONTINUE
:
1083 case BRW_OPCODE_WHILE
:
1087 case SHADER_OPCODE_RCP
:
1088 MATH(BRW_MATH_FUNCTION_INV
, dst
, src
[0]);
1091 case SHADER_OPCODE_RSQ
:
1092 MATH(BRW_MATH_FUNCTION_RSQ
, dst
, src
[0]);
1095 case SHADER_OPCODE_SQRT
:
1096 MATH(BRW_MATH_FUNCTION_SQRT
, dst
, src
[0]);
1099 case SHADER_OPCODE_EXP2
:
1100 MATH(BRW_MATH_FUNCTION_EXP
, dst
, src
[0]);
1103 case SHADER_OPCODE_LOG2
:
1104 MATH(BRW_MATH_FUNCTION_LOG
, dst
, src
[0]);
1107 case SHADER_OPCODE_SIN
:
1108 MATH(BRW_MATH_FUNCTION_SIN
, dst
, src
[0]);
1111 case SHADER_OPCODE_COS
:
1112 MATH(BRW_MATH_FUNCTION_COS
, dst
, src
[0]);
1115 case SHADER_OPCODE_INT_QUOTIENT
:
1116 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
, dst
, src
[0], src
[1]);
1119 case SHADER_OPCODE_INT_REMAINDER
:
1120 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER
, dst
, src
[0], src
[1]);
1123 case SHADER_OPCODE_POW
:
1124 MATH(BRW_MATH_FUNCTION_POW
, dst
, src
[0], src
[1]);
1127 case FS_OPCODE_PIXEL_X
:
1128 case FS_OPCODE_PIXEL_Y
:
1129 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1132 case FS_OPCODE_CINTERP
:
1135 case FS_OPCODE_LINTERP
:
1136 generate_linterp(ir
, dst
, src
);
1138 case SHADER_OPCODE_TEX
:
1140 case SHADER_OPCODE_TXD
:
1141 case SHADER_OPCODE_TXF
:
1142 case SHADER_OPCODE_TXF_CMS
:
1143 case SHADER_OPCODE_TXF_UMS
:
1144 case SHADER_OPCODE_TXF_MCS
:
1145 case SHADER_OPCODE_TXL
:
1146 case SHADER_OPCODE_TXS
:
1147 case SHADER_OPCODE_LOD
:
1148 case SHADER_OPCODE_TG4
:
1149 case SHADER_OPCODE_TG4_OFFSET
:
1150 generate_tex(ir
, dst
, src
[0]);
1154 generate_ddx(ir
, dst
, src
[0]);
1157 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1158 * guarantee that c->key.render_to_fbo is set).
1160 assert(fp
->UsesDFdy
);
1161 generate_ddy(ir
, dst
, src
[0], c
->key
.render_to_fbo
);
1164 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
1165 generate_scratch_write(ir
, src
[0]);
1168 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
1169 generate_scratch_read(ir
, dst
);
1172 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
1173 generate_scratch_read_gen7(ir
, dst
);
1176 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1177 generate_uniform_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1180 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7
:
1181 generate_varying_pull_constant_load(ir
, dst
, src
[0], src
[1]);
1184 case FS_OPCODE_FB_WRITE
:
1185 generate_fb_write(ir
);
1188 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS
:
1189 generate_mov_dispatch_to_flags(ir
);
1192 case FS_OPCODE_DISCARD_JUMP
:
1193 generate_discard_jump(ir
);
1196 case SHADER_OPCODE_SHADER_TIME_ADD
:
1197 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1200 case SHADER_OPCODE_UNTYPED_ATOMIC
:
1201 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
1204 case SHADER_OPCODE_UNTYPED_SURFACE_READ
:
1205 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
1208 case FS_OPCODE_SET_SIMD4X2_OFFSET
:
1209 generate_set_simd4x2_offset(ir
, dst
, src
[0]);
1212 case FS_OPCODE_SET_OMASK
:
1213 generate_set_omask(ir
, dst
, src
[0]);
1216 case FS_OPCODE_SET_SAMPLE_ID
:
1217 generate_set_sample_id(ir
, dst
, src
[0], src
[1]);
1220 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
1221 generate_pack_half_2x16_split(ir
, dst
, src
[0], src
[1]);
1224 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X
:
1225 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y
:
1226 generate_unpack_half_2x16_split(ir
, dst
, src
[0]);
1229 case FS_OPCODE_PLACEHOLDER_HALT
:
1230 /* This is the place where the final HALT needs to be inserted if
1231 * we've emitted any discards. If not, this will emit no code.
1233 patch_discard_jumps_to_fb_writes();
1237 if (ir
->opcode
< int(ARRAY_SIZE(opcode_descs
))) {
1238 _mesa_problem(ctx
, "Unsupported opcode `%s' in FS",
1239 opcode_descs
[ir
->opcode
].name
);
1241 _mesa_problem(ctx
, "Unsupported opcode %d in FS", ir
->opcode
);
1246 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1247 disassemble(stderr
, last_native_inst_offset
, next_inst_offset
);
1249 foreach_list(node
, &cfg
->block_list
) {
1250 bblock_link
*link
= (bblock_link
*)node
;
1251 bblock_t
*block
= link
->block
;
1253 if (block
->end
== ir
) {
1254 fprintf(stderr
, "   END B%d", block
->block_num
);
1255 foreach_list(successor_node
, &block
->children
) {
1256 bblock_link
*successor_link
=
1257 (bblock_link
*)successor_node
;
1258 bblock_t
*successor_block
= successor_link
->block
;
1259 fprintf(stderr
, " ->B%d", successor_block
->block_num
);
1261 fprintf(stderr
, "\n");
1266 last_native_inst_offset
= next_inst_offset
;
1269 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1270 fprintf(stderr
, "\n");
1273 patch_jump_targets();
1275 /* OK, while the INTEL_DEBUG=fs above is very nice for debugging FS
1276 * emit issues, it doesn't get the jump distances into the output,
1277 * which is often something we want to debug. So this is here in
1278 * case you're doing that.
1280 if (0 && unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1281 disassemble(stderr
, 0, next_inst_offset
);
/* Top-level entry point: generates the SIMD8 program (if any), then pads to
 * a 64-byte boundary, records prog_offset_16, and generates the SIMD16
 * program (if any).  Returns the instruction store and its size in bytes
 * via *assembly_size.
 * NOTE(review): garbled span with interior lines missing; code untouched. */
1286 gen8_fs_generator::generate_assembly(exec_list
*simd8_instructions
,
1287 exec_list
*simd16_instructions
,
1288 unsigned *assembly_size
)
1290 assert(simd8_instructions
|| simd16_instructions
);
1292 if (simd8_instructions
) {
1294 generate_code(simd8_instructions
);
1297 if (simd16_instructions
) {
1298 /* Align to a 64-byte boundary. */
1299 while ((nr_inst
* sizeof(gen8_instruction
)) % 64)
1302 /* Save off the start of this SIMD16 program */
1303 c
->prog_data
.prog_offset_16
= nr_inst
* sizeof(gen8_instruction
);
1305 dispatch_width
= 16;
1306 generate_code(simd16_instructions
);
1309 *assembly_size
= next_inst_offset
;
1310 return (const unsigned *) store
;