src/mesa/drivers/dri/i965/brw_fs_generator.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_generator.cpp
  25  *
  26  * This file supports generating code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 extern "C" {
  31 #include "main/macros.h"
  32 #include "brw_context.h"
  33 #include "brw_eu.h"
  34 } /* extern "C" */
  35
  36 #include "brw_fs.h"
  37 #include "brw_cfg.h"
  38
  39 fs_generator::fs_generator(struct brw_context *brw,
  40                            struct brw_wm_compile *c,
  41                            struct gl_shader_program *prog,
  42                            struct gl_fragment_program *fp,
  43                            bool dual_source_output)
  44
  45    : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
  46 {
  47    ctx = &brw->ctx;
  48
  49    mem_ctx = c;
  50
  51    p = rzalloc(mem_ctx, struct brw_compile);
  52    brw_init_compile(brw, p, mem_ctx);
  53 }
  54
  55 fs_generator::~fs_generator()
  56 {
  57 }
  58
  59 void
  60 fs_generator::patch_discard_jumps_to_fb_writes()
  61 {
  62    if (brw->gen < 6 || this->discard_halt_patches.is_empty())
  63       return;
  64
  65    /* There is a somewhat strange undocumented requirement of using
  66     * HALT, according to the simulator.  If some channel has HALTed to
  67     * a particular UIP, then by the end of the program, every channel
  68     * must have HALTed to that UIP.  Furthermore, the tracking is a
  69     * stack, so you can't do the final halt of a UIP after starting
  70     * halting to a new UIP.
  71     *
  72     * Symptoms of not emitting this instruction on actual hardware
  73     * included GPU hangs and sparkly rendering on the piglit discard
  74     * tests.
  75     */
  76    struct brw_instruction *last_halt = gen6_HALT(p);
  77    last_halt->bits3.break_cont.uip = 2;
  78    last_halt->bits3.break_cont.jip = 2;
  79
  80    int ip = p->nr_insn;
  81
  82    foreach_list(node, &this->discard_halt_patches) {
  83       ip_record *patch_ip = (ip_record *)node;
  84       struct brw_instruction *patch = &p->store[patch_ip->ip];
  85
  86       assert(patch->header.opcode == BRW_OPCODE_HALT);
  87       /* HALT takes a half-instruction distance from the pre-incremented IP. */
  88       patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
  89    }
  90
  91    this->discard_halt_patches.make_empty();
  92 }
  93
  94 void
  95 fs_generator::generate_fb_write(fs_inst *inst)
  96 {
  97    bool eot = inst->eot;
  98    struct brw_reg implied_header;
  99    uint32_t msg_control;
 100
 101    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
 102     * move, here's g1.
 103     */
 104    brw_push_insn_state(p);
 105    brw_set_mask_control(p, BRW_MASK_DISABLE);
 106    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 107
 108    if (inst->header_present) {
 109       /* On HSW, the GPU will use the predicate on SENDC, unless the header is
 110        * present.
 111        */
 112       if (!brw->is_haswell && ((fp && fp->UsesKill) ||
 113                                c->key.alpha_test_func)) {
 114          struct brw_reg pixel_mask;
 115
 116          if (brw->gen >= 6)
 117             pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 118          else
 119             pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 120
 121          brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
 122       }
 123
 124       if (brw->gen >= 6) {
 125          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 126          brw_MOV(p,
 127                  retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
 128                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 129          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 130
 131          if (inst->target > 0 && c->key.replicate_alpha) {
 132             /* Set "Source0 Alpha Present to RenderTarget" bit in message
 133              * header.
 134              */
 135             brw_OR(p,
 136                    vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
 137                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
 138                    brw_imm_ud(0x1 << 11));
 139          }
 140
 141          if (inst->target > 0) {
 142             /* Set the render target index for choosing BLEND_STATE. */
 143             brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
 144                                            inst->base_mrf, 2),
 145                               BRW_REGISTER_TYPE_UD),
 146                     brw_imm_ud(inst->target));
 147          }
 148
 149          implied_header = brw_null_reg();
 150       } else {
 151          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 152
 153          brw_MOV(p,
 154                  brw_message_reg(inst->base_mrf + 1),
 155                  brw_vec8_grf(1, 0));
 156       }
 157    } else {
 158       implied_header = brw_null_reg();
 159    }
 160
 161    if (this->dual_source_output)
 162       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 163    else if (dispatch_width == 16)
 164       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 165    else
 166       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 167
 168    brw_pop_insn_state(p);
 169
 170    uint32_t surf_index =
 171       c->prog_data.binding_table.render_target_start + inst->target;
 172    brw_fb_WRITE(p,
 173                 dispatch_width,
 174                 inst->base_mrf,
 175                 implied_header,
 176                 msg_control,
 177                 surf_index,
 178                 inst->mlen,
 179                 0,
 180                 eot,
 181                 inst->header_present);
 182
 183    brw_mark_surface_used(&c->prog_data.base, surf_index);
 184 }
 185
 186 void
 187 fs_generator::generate_blorp_fb_write(fs_inst *inst)
 188 {
 189    brw_fb_WRITE(p,
 190                 16 /* dispatch_width */,
 191                 inst->base_mrf,
 192                 brw_reg_from_fs_reg(&inst->src[0]),
 193                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
 194                 inst->target,
 195                 inst->mlen,
 196                 0,
 197                 true,
 198                 inst->header_present);
 199 }
 200
 201 /* Computes the integer pixel x,y values from the origin.
 202  *
 203  * This is the basis of gl_FragCoord computation, but is also used
 204  * pre-gen6 for computing the deltas from v0 for computing
 205  * interpolation.
 206  */
 207 void
 208 fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
 209 {
 210    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 211    struct brw_reg src;
 212    struct brw_reg deltas;
 213
 214    if (is_x) {
 215       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
 216       deltas = brw_imm_v(0x10101010);
 217    } else {
 218       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
 219       deltas = brw_imm_v(0x11001100);
 220    }
 221
 222    if (dispatch_width == 16) {
 223       dst = vec16(dst);
 224    }
 225
 226    /* We do this SIMD8 or SIMD16, but since the destination is UW we
 227     * don't do compression in the SIMD16 case.
 228     */
 229    brw_push_insn_state(p);
 230    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 231    brw_ADD(p, dst, src, deltas);
 232    brw_pop_insn_state(p);
 233 }
 234
 235 void
 236 fs_generator::generate_linterp(fs_inst *inst,
 237                              struct brw_reg dst, struct brw_reg *src)
 238 {
 239    struct brw_reg delta_x = src[0];
 240    struct brw_reg delta_y = src[1];
 241    struct brw_reg interp = src[2];
 242
 243    if (brw->has_pln &&
 244        delta_y.nr == delta_x.nr + 1 &&
 245        (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
 246       brw_PLN(p, dst, interp, delta_x);
 247    } else {
 248       brw_LINE(p, brw_null_reg(), interp, delta_x);
 249       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 250    }
 251 }
 252
 253 void
 254 fs_generator::generate_math1_gen7(fs_inst *inst,
 255                                 struct brw_reg dst,
 256                                 struct brw_reg src0)
 257 {
 258    assert(inst->mlen == 0);
 259    brw_math(p, dst,
 260             brw_math_function(inst->opcode),
 261             0, src0,
 262             BRW_MATH_DATA_VECTOR,
 263             BRW_MATH_PRECISION_FULL);
 264 }
 265
 266 void
 267 fs_generator::generate_math2_gen7(fs_inst *inst,
 268                                 struct brw_reg dst,
 269                                 struct brw_reg src0,
 270                                 struct brw_reg src1)
 271 {
 272    assert(inst->mlen == 0);
 273    brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
 274 }
 275
 276 void
 277 fs_generator::generate_math1_gen6(fs_inst *inst,
 278                                 struct brw_reg dst,
 279                                 struct brw_reg src0)
 280 {
 281    int op = brw_math_function(inst->opcode);
 282
 283    assert(inst->mlen == 0);
 284
 285    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 286    brw_math(p, dst,
 287             op,
 288             0, src0,
 289             BRW_MATH_DATA_VECTOR,
 290             BRW_MATH_PRECISION_FULL);
 291
 292    if (dispatch_width == 16) {
 293       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 294       brw_math(p, sechalf(dst),
 295                op,
 296                0, sechalf(src0),
 297                BRW_MATH_DATA_VECTOR,
 298                BRW_MATH_PRECISION_FULL);
 299       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 300    }
 301 }
 302
 303 void
 304 fs_generator::generate_math2_gen6(fs_inst *inst,
 305                                 struct brw_reg dst,
 306                                 struct brw_reg src0,
 307                                 struct brw_reg src1)
 308 {
 309    int op = brw_math_function(inst->opcode);
 310
 311    assert(inst->mlen == 0);
 312
 313    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 314    brw_math2(p, dst, op, src0, src1);
 315
 316    if (dispatch_width == 16) {
 317       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 318       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
 319       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 320    }
 321 }
 322
 323 void
 324 fs_generator::generate_math_gen4(fs_inst *inst,
 325                                struct brw_reg dst,
 326                                struct brw_reg src)
 327 {
 328    int op = brw_math_function(inst->opcode);
 329
 330    assert(inst->mlen >= 1);
 331
 332    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 333    brw_math(p, dst,
 334             op,
 335             inst->base_mrf, src,
 336             BRW_MATH_DATA_VECTOR,
 337             BRW_MATH_PRECISION_FULL);
 338
 339    if (dispatch_width == 16) {
 340       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 341       brw_math(p, sechalf(dst),
 342                op,
 343                inst->base_mrf + 1, sechalf(src),
 344                BRW_MATH_DATA_VECTOR,
 345                BRW_MATH_PRECISION_FULL);
 346
 347       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 348    }
 349 }
 350
 351 void
 352 fs_generator::generate_math_g45(fs_inst *inst,
 353                                 struct brw_reg dst,
 354                                 struct brw_reg src)
 355 {
 356    if (inst->opcode == SHADER_OPCODE_POW ||
 357        inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
 358        inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
 359       generate_math_gen4(inst, dst, src);
 360       return;
 361    }
 362
 363    int op = brw_math_function(inst->opcode);
 364
 365    assert(inst->mlen >= 1);
 366
 367    brw_math(p, dst,
 368             op,
 369             inst->base_mrf, src,
 370             BRW_MATH_DATA_VECTOR,
 371             BRW_MATH_PRECISION_FULL);
 372 }
 373
 374 void
 375 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 376 {
 377    int msg_type = -1;
 378    int rlen = 4;
 379    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 380    uint32_t return_format;
 381
 382    switch (dst.type) {
 383    case BRW_REGISTER_TYPE_D:
 384       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
 385       break;
 386    case BRW_REGISTER_TYPE_UD:
 387       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 388       break;
 389    default:
 390       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 391       break;
 392    }
 393
 394    if (dispatch_width == 16 &&
 395       !inst->force_uncompressed && !inst->force_sechalf)
 396       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 397
 398    if (brw->gen >= 5) {
 399       switch (inst->opcode) {
 400       case SHADER_OPCODE_TEX:
 401          if (inst->shadow_compare) {
 402             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 403          } else {
 404             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 405          }
 406          break;
 407       case FS_OPCODE_TXB:
 408          if (inst->shadow_compare) {
 409             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 410          } else {
 411             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 412          }
 413          break;
 414       case SHADER_OPCODE_TXL:
 415          if (inst->shadow_compare) {
 416             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 417          } else {
 418             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 419          }
 420          break;
 421       case SHADER_OPCODE_TXS:
 422          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 423          break;
 424       case SHADER_OPCODE_TXD:
 425          if (inst->shadow_compare) {
 426             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
 427             assert(brw->is_haswell);
 428             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
 429          } else {
 430             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 431          }
 432          break;
 433       case SHADER_OPCODE_TXF:
 434          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 435          break;
 436       case SHADER_OPCODE_TXF_CMS:
 437          if (brw->gen >= 7)
 438             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
 439          else
 440             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 441          break;
 442       case SHADER_OPCODE_TXF_UMS:
 443          assert(brw->gen >= 7);
 444          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
 445          break;
 446       case SHADER_OPCODE_TXF_MCS:
 447          assert(brw->gen >= 7);
 448          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
 449          break;
 450       case SHADER_OPCODE_LOD:
 451          msg_type = GEN5_SAMPLER_MESSAGE_LOD;
 452          break;
 453       case SHADER_OPCODE_TG4:
 454          if (inst->shadow_compare) {
 455             assert(brw->gen >= 7);
 456             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
 457          } else {
 458             assert(brw->gen >= 6);
 459             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
 460          }
 461          break;
 462       case SHADER_OPCODE_TG4_OFFSET:
 463          assert(brw->gen >= 7);
 464          if (inst->shadow_compare) {
 465             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
 466          } else {
 467             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
 468          }
 469          break;
 470       default:
 471          assert(!"not reached");
 472          break;
 473       }
 474    } else {
 475       switch (inst->opcode) {
 476       case SHADER_OPCODE_TEX:
 477          /* Note that G45 and older determines shadow compare and dispatch width
 478           * from message length for most messages.
 479           */
 480          assert(dispatch_width == 8);
 481          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
 482          if (inst->shadow_compare) {
 483             assert(inst->mlen == 6);
 484          } else {
 485             assert(inst->mlen <= 4);
 486          }
 487          break;
 488       case FS_OPCODE_TXB:
 489          if (inst->shadow_compare) {
 490             assert(inst->mlen == 6);
 491             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 492          } else {
 493             assert(inst->mlen == 9);
 494             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 495             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 496          }
 497          break;
 498       case SHADER_OPCODE_TXL:
 499          if (inst->shadow_compare) {
 500             assert(inst->mlen == 6);
 501             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 502          } else {
 503             assert(inst->mlen == 9);
 504             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
 505             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 506          }
 507          break;
 508       case SHADER_OPCODE_TXD:
 509          /* There is no sample_d_c message; comparisons are done manually */
 510          assert(inst->mlen == 7 || inst->mlen == 10);
 511          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 512          break;
 513       case SHADER_OPCODE_TXF:
 514          assert(inst->mlen == 9);
 515          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 516          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 517          break;
 518       case SHADER_OPCODE_TXS:
 519          assert(inst->mlen == 3);
 520          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
 521          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 522          break;
 523       default:
 524          assert(!"not reached");
 525          break;
 526       }
 527    }
 528    assert(msg_type != -1);
 529
 530    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 531       rlen = 8;
 532       dst = vec16(dst);
 533    }
 534
 535    if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
 536       /* The send-from-GRF for SIMD16 texturing with a header has an extra
 537        * hardware register allocated to it, which we need to skip over (since
 538        * our coordinates in the payload are in the even-numbered registers,
 539        * and the header comes right before the first one).
 540        */
 541       assert(src.file == BRW_GENERAL_REGISTER_FILE);
 542       src.nr++;
 543    }
 544
 545    /* Load the message header if present.  If there's a texture offset,
 546     * we need to set it up explicitly and load the offset bitfield.
 547     * Otherwise, we can use an implied move from g0 to the first message reg.
 548     */
 549    if (inst->header_present) {
 550       if (brw->gen < 6 && !inst->texture_offset) {
 551          /* Set up an implied move from g0 to the MRF. */
 552          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 553       } else {
 554          struct brw_reg header_reg;
 555
 556          if (brw->gen >= 7) {
 557             header_reg = src;
 558          } else {
 559             assert(inst->base_mrf != -1);
 560             header_reg = brw_message_reg(inst->base_mrf);
 561          }
 562
 563          brw_push_insn_state(p);
 564          brw_set_mask_control(p, BRW_MASK_DISABLE);
 565          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 566          /* Explicitly set up the message header by copying g0 to the MRF. */
 567          brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
 568
 569          if (inst->texture_offset) {
 570             /* Set the offset bits in DWord 2. */
 571             brw_MOV(p, get_element_ud(header_reg, 2),
 572                        brw_imm_ud(inst->texture_offset));
 573          }
 574
 575          if (inst->sampler >= 16) {
 576             /* The "Sampler Index" field can only store values between 0 and 15.
 577              * However, we can add an offset to the "Sampler State Pointer"
 578              * field, effectively selecting a different set of 16 samplers.
 579              *
 580              * The "Sampler State Pointer" needs to be aligned to a 32-byte
 581              * offset, and each sampler state is only 16-bytes, so we can't
 582              * exclusively use the offset - we have to use both.
 583              */
 584             assert(brw->is_haswell); /* field only exists on Haswell */
 585             brw_ADD(p,
 586                     get_element_ud(header_reg, 3),
 587                     get_element_ud(brw_vec8_grf(0, 0), 3),
 588                     brw_imm_ud(16 * (inst->sampler / 16) *
 589                                sizeof(gen7_sampler_state)));
 590          }
 591          brw_pop_insn_state(p);
 592       }
 593    }
 594
 595    uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 ||
 596       inst->opcode == SHADER_OPCODE_TG4_OFFSET)
 597       ? c->prog_data.base.binding_table.gather_texture_start
 598       : c->prog_data.base.binding_table.texture_start) + inst->sampler;
 599
 600    brw_SAMPLE(p,
 601               retype(dst, BRW_REGISTER_TYPE_UW),
 602               inst->base_mrf,
 603               src,
 604               surface_index,
 605               inst->sampler % 16,
 606               msg_type,
 607               rlen,
 608               inst->mlen,
 609               inst->header_present,
 610               simd_mode,
 611               return_format);
 612
 613    brw_mark_surface_used(&c->prog_data.base, surface_index);
 614 }
 615
 616
 617 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 618  * looking like:
 619  *
 620  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 621  *
 622  * Ideally, we want to produce:
 623  *
 624  *           DDX                     DDY
 625  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 626  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 627  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 628  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 629  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 630  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 631  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 632  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 633  *
 634  * and add another set of two more subspans if in 16-pixel dispatch mode.
 635  *
 636  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 637  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 638  * pair.  But the ideal approximation may impose a huge performance cost on
 639  * sample_d.  On at least Haswell, sample_d instruction does some
 640  * optimizations if the same LOD is used for all pixels in the subspan.
 641  *
 642  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 643  * appropriate swizzling.
 644  */
 645 void
 646 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 647 {
 648    unsigned vstride, width;
 649
 650    if (c->key.high_quality_derivatives) {
 651       /* produce accurate derivatives */
 652       vstride = BRW_VERTICAL_STRIDE_2;
 653       width = BRW_WIDTH_2;
 654    }
 655    else {
 656       /* replicate the derivative at the top-left pixel to other pixels */
 657       vstride = BRW_VERTICAL_STRIDE_4;
 658       width = BRW_WIDTH_4;
 659    }
 660
 661    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 662                                  BRW_REGISTER_TYPE_F,
 663                                  vstride,
 664                                  width,
 665                                  BRW_HORIZONTAL_STRIDE_0,
 666                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 667    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 668                                  BRW_REGISTER_TYPE_F,
 669                                  vstride,
 670                                  width,
 671                                  BRW_HORIZONTAL_STRIDE_0,
 672                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 673    brw_ADD(p, dst, src0, negate(src1));
 674 }
 675
 676 /* The negate_value boolean is used to negate the derivative computation for
 677  * FBOs, since they place the origin at the upper left instead of the lower
 678  * left.
 679  */
 680 void
 681 fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
 682                          bool negate_value)
 683 {
 684    if (c->key.high_quality_derivatives) {
 685       /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
 686        * Region Restrictions):
 687        *
 688        *     In Align16 access mode, SIMD16 is not allowed for DW operations
 689        *     and SIMD8 is not allowed for DF operations.
 690        *
 691        * In this context, "DW operations" means "operations acting on 32-bit
 692        * values", so it includes operations on floats.
 693        *
 694        * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
 695        * (Instruction Compression -> Rules and Restrictions):
 696        *
 697        *     A compressed instruction must be in Align1 access mode. Align16
 698        *     mode instructions cannot be compressed.
 699        *
 700        * Similar text exists in the g45 PRM.
 701        *
 702        * On these platforms, if we're building a SIMD16 shader, we need to
 703        * manually unroll to a pair of SIMD8 instructions.
 704        */
 705       bool unroll_to_simd8 =
 706          (dispatch_width == 16 &&
 707           (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
 708
 709       /* produce accurate derivatives */
 710       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 711                                     BRW_REGISTER_TYPE_F,
 712                                     BRW_VERTICAL_STRIDE_4,
 713                                     BRW_WIDTH_4,
 714                                     BRW_HORIZONTAL_STRIDE_1,
 715                                     BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
 716       struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 717                                     BRW_REGISTER_TYPE_F,
 718                                     BRW_VERTICAL_STRIDE_4,
 719                                     BRW_WIDTH_4,
 720                                     BRW_HORIZONTAL_STRIDE_1,
 721                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
 722       brw_push_insn_state(p);
 723       brw_set_access_mode(p, BRW_ALIGN_16);
 724       if (unroll_to_simd8)
 725          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 726       if (negate_value)
 727          brw_ADD(p, dst, src1, negate(src0));
 728       else
 729          brw_ADD(p, dst, src0, negate(src1));
 730       if (unroll_to_simd8) {
 731          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 732          src0 = sechalf(src0);
 733          src1 = sechalf(src1);
 734          dst = sechalf(dst);
 735          if (negate_value)
 736             brw_ADD(p, dst, src1, negate(src0));
 737          else
 738             brw_ADD(p, dst, src0, negate(src1));
 739       }
 740       brw_pop_insn_state(p);
 741    } else {
 742       /* replicate the derivative at the top-left pixel to other pixels */
 743       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 744                                     BRW_REGISTER_TYPE_F,
 745                                     BRW_VERTICAL_STRIDE_4,
 746                                     BRW_WIDTH_4,
 747                                     BRW_HORIZONTAL_STRIDE_0,
 748                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 749       struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
 750                                     BRW_REGISTER_TYPE_F,
 751                                     BRW_VERTICAL_STRIDE_4,
 752                                     BRW_WIDTH_4,
 753                                     BRW_HORIZONTAL_STRIDE_0,
 754                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 755       if (negate_value)
 756          brw_ADD(p, dst, src1, negate(src0));
 757       else
 758          brw_ADD(p, dst, src0, negate(src1));
 759    }
 760 }
 761
 762 void
 763 fs_generator::generate_discard_jump(fs_inst *inst)
 764 {
 765    assert(brw->gen >= 6);
 766
 767    /* This HALT will be patched up at FB write time to point UIP at the end of
 768     * the program, and at brw_uip_jip() JIP will be set to the end of the
 769     * current block (or the program).
 770     */
 771    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
 772
 773    brw_push_insn_state(p);
 774    brw_set_mask_control(p, BRW_MASK_DISABLE);
 775    gen6_HALT(p);
 776    brw_pop_insn_state(p);
 777 }
 778
 779 void
 780 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
 781 {
 782    assert(inst->mlen != 0);
 783
 784    brw_MOV(p,
 785            retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
 786            retype(src, BRW_REGISTER_TYPE_UD));
 787    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
 788                                  dispatch_width / 8, inst->offset);
 789 }
 790
 791 void
 792 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
 793 {
 794    assert(inst->mlen != 0);
 795
 796    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
 797                                 dispatch_width / 8, inst->offset);
 798 }
 799
 800 void
 801 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
 802 {
 803    gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);
 804 }
 805
 806 void
 807 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 808                                                   struct brw_reg dst,
 809                                                   struct brw_reg index,
 810                                                   struct brw_reg offset)
 811 {
 812    assert(inst->mlen != 0);
 813
 814    assert(index.file == BRW_IMMEDIATE_VALUE &&
 815           index.type == BRW_REGISTER_TYPE_UD);
 816    uint32_t surf_index = index.dw1.ud;
 817
 818    assert(offset.file == BRW_IMMEDIATE_VALUE &&
 819           offset.type == BRW_REGISTER_TYPE_UD);
 820    uint32_t read_offset = offset.dw1.ud;
 821
 822    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
 823                         read_offset, surf_index);
 824
 825    brw_mark_surface_used(&c->prog_data.base, surf_index);
 826 }
 827
 828 void
 829 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 830                                                        struct brw_reg dst,
 831                                                        struct brw_reg index,
 832                                                        struct brw_reg offset)
 833 {
 834    assert(inst->mlen == 0);
 835
 836    assert(index.file == BRW_IMMEDIATE_VALUE &&
 837           index.type == BRW_REGISTER_TYPE_UD);
 838    uint32_t surf_index = index.dw1.ud;
 839
 840    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
 841    /* Reference just the dword we need, to avoid angering validate_reg(). */
 842    offset = brw_vec1_grf(offset.nr, 0);
 843
 844    brw_push_insn_state(p);
 845    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 846    brw_set_mask_control(p, BRW_MASK_DISABLE);
 847    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 848    brw_pop_insn_state(p);
 849
 850    /* We use the SIMD4x2 mode because we want to end up with 4 components in
 851     * the destination loaded consecutively from the same offset (which appears
 852     * in the first component, and the rest are ignored).
 853     */
 854    dst.width = BRW_WIDTH_4;
 855    brw_set_dest(p, send, dst);
 856    brw_set_src0(p, send, offset);
 857    brw_set_sampler_message(p, send,
 858                            surf_index,
 859                            0, /* LD message ignores sampler unit */
 860                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 861                            1, /* rlen */
 862                            1, /* mlen */
 863                            false, /* no header */
 864                            BRW_SAMPLER_SIMD_MODE_SIMD4X2,
 865                            0);
 866
 867    brw_mark_surface_used(&c->prog_data.base, surf_index);
 868 }
 869
 870 void
 871 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
 872                                                   struct brw_reg dst,
 873                                                   struct brw_reg index,
 874                                                   struct brw_reg offset)
 875 {
 876    assert(brw->gen < 7); /* Should use the gen7 variant. */
 877    assert(inst->header_present);
 878    assert(inst->mlen);
 879
 880    assert(index.file == BRW_IMMEDIATE_VALUE &&
 881           index.type == BRW_REGISTER_TYPE_UD);
 882    uint32_t surf_index = index.dw1.ud;
 883
 884    uint32_t simd_mode, rlen, msg_type;
 885    if (dispatch_width == 16) {
 886       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 887       rlen = 8;
 888    } else {
 889       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 890       rlen = 4;
 891    }
 892
 893    if (brw->gen >= 5)
 894       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 895    else {
 896       /* We always use the SIMD16 message so that we only have to load U, and
 897        * not V or R.
 898        */
 899       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 900       assert(inst->mlen == 3);
 901       assert(inst->regs_written == 8);
 902       rlen = 8;
 903       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 904    }
 905
 906    struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
 907                                       BRW_REGISTER_TYPE_D);
 908    brw_MOV(p, offset_mrf, offset);
 909
 910    struct brw_reg header = brw_vec8_grf(0, 0);
 911    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 912
 913    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 914    send->header.compression_control = BRW_COMPRESSION_NONE;
 915    brw_set_dest(p, send, dst);
 916    brw_set_src0(p, send, header);
 917    if (brw->gen < 6)
 918       send->header.destreg__conditionalmod = inst->base_mrf;
 919
 920    /* Our surface is set up as floats, regardless of what actual data is
 921     * stored in it.
 922     */
 923    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 924    brw_set_sampler_message(p, send,
 925                            surf_index,
 926                            0, /* sampler (unused) */
 927                            msg_type,
 928                            rlen,
 929                            inst->mlen,
 930                            inst->header_present,
 931                            simd_mode,
 932                            return_format);
 933
 934    brw_mark_surface_used(&c->prog_data.base, surf_index);
 935 }
 936
 937 void
 938 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
 939                                                        struct brw_reg dst,
 940                                                        struct brw_reg index,
 941                                                        struct brw_reg offset)
 942 {
 943    assert(brw->gen >= 7);
 944    /* Varying-offset pull constant loads are treated as a normal expression on
 945     * gen7, so the fact that it's a send message is hidden at the IR level.
 946     */
 947    assert(!inst->header_present);
 948    assert(!inst->mlen);
 949
 950    assert(index.file == BRW_IMMEDIATE_VALUE &&
 951           index.type == BRW_REGISTER_TYPE_UD);
 952    uint32_t surf_index = index.dw1.ud;
 953
 954    uint32_t simd_mode, rlen, mlen;
 955    if (dispatch_width == 16) {
 956       mlen = 2;
 957       rlen = 8;
 958       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 959    } else {
 960       mlen = 1;
 961       rlen = 4;
 962       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 963    }
 964
 965    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 966    brw_set_dest(p, send, dst);
 967    brw_set_src0(p, send, offset);
 968    brw_set_sampler_message(p, send,
 969                            surf_index,
 970                            0, /* LD message ignores sampler unit */
 971                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 972                            rlen,
 973                            mlen,
 974                            false, /* no header */
 975                            simd_mode,
 976                            0);
 977
 978    brw_mark_surface_used(&c->prog_data.base, surf_index);
 979 }
 980
 981 /**
 982  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 983  * into the flags register (f0.0).
 984  *
 985  * Used only on Gen6 and above.
 986  */
 987 void
 988 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
 989 {
 990    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
 991    struct brw_reg dispatch_mask;
 992
 993    if (brw->gen >= 6)
 994       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 995    else
 996       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 997
 998    brw_push_insn_state(p);
 999    brw_set_mask_control(p, BRW_MASK_DISABLE);
1000    brw_MOV(p, flags, dispatch_mask);
1001    brw_pop_insn_state(p);
1002 }
1003
1004
1005 static uint32_t brw_file_from_reg(fs_reg *reg)
1006 {
1007    switch (reg->file) {
1008    case GRF:
1009       return BRW_GENERAL_REGISTER_FILE;
1010    case MRF:
1011       return BRW_MESSAGE_REGISTER_FILE;
1012    case IMM:
1013       return BRW_IMMEDIATE_VALUE;
1014    default:
1015       assert(!"not reached");
1016       return BRW_GENERAL_REGISTER_FILE;
1017    }
1018 }
1019
1020 struct brw_reg
1021 brw_reg_from_fs_reg(fs_reg *reg)
1022 {
1023    struct brw_reg brw_reg;
1024
1025    switch (reg->file) {
1026    case GRF:
1027    case MRF:
1028       if (reg->stride == 0) {
1029          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
1030       } else {
1031          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
1032          brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
1033       }
1034
1035       brw_reg = retype(brw_reg, reg->type);
1036       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
1037       break;
1038    case IMM:
1039       switch (reg->type) {
1040       case BRW_REGISTER_TYPE_F:
1041          brw_reg = brw_imm_f(reg->imm.f);
1042          break;
1043       case BRW_REGISTER_TYPE_D:
1044          brw_reg = brw_imm_d(reg->imm.i);
1045          break;
1046       case BRW_REGISTER_TYPE_UD:
1047          brw_reg = brw_imm_ud(reg->imm.u);
1048          break;
1049       default:
1050          assert(!"not reached");
1051          brw_reg = brw_null_reg();
1052          break;
1053       }
1054       break;
1055    case HW_REG:
1056       brw_reg = reg->fixed_hw_reg;
1057       break;
1058    case BAD_FILE:
1059       /* Probably unused. */
1060       brw_reg = brw_null_reg();
1061       break;
1062    case UNIFORM:
1063       assert(!"not reached");
1064       brw_reg = brw_null_reg();
1065       break;
1066    default:
1067       assert(!"not reached");
1068       brw_reg = brw_null_reg();
1069       break;
1070    }
1071    if (reg->abs)
1072       brw_reg = brw_abs(brw_reg);
1073    if (reg->negate)
1074       brw_reg = negate(brw_reg);
1075
1076    return brw_reg;
1077 }
1078
1079 /**
1080  * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
1081  * sampler LD messages.
1082  *
1083  * We don't want to bake it into the send message's code generation because
1084  * that means we don't get a chance to schedule the instructions.
1085  */
1086 void
1087 fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
1088                                           struct brw_reg dst,
1089                                           struct brw_reg value)
1090 {
1091    assert(value.file == BRW_IMMEDIATE_VALUE);
1092
1093    brw_push_insn_state(p);
1094    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1095    brw_set_mask_control(p, BRW_MASK_DISABLE);
1096    brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
1097    brw_pop_insn_state(p);
1098 }
1099
1100 /* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
1101  * (when mask is passed as a uniform) of register mask before moving it
1102  * to register dst.
1103  */
1104 void
1105 fs_generator::generate_set_omask(fs_inst *inst,
1106                                  struct brw_reg dst,
1107                                  struct brw_reg mask)
1108 {
1109    bool stride_8_8_1 =
1110     (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
1111      mask.width == BRW_WIDTH_8 &&
1112      mask.hstride == BRW_HORIZONTAL_STRIDE_1);
1113
1114    bool stride_0_1_0 =
1115     (mask.vstride == BRW_VERTICAL_STRIDE_0 &&
1116      mask.width == BRW_WIDTH_1 &&
1117      mask.hstride == BRW_HORIZONTAL_STRIDE_0);
1118
1119    assert(stride_8_8_1 || stride_0_1_0);
1120    assert(dst.type == BRW_REGISTER_TYPE_UW);
1121
1122    if (dispatch_width == 16)
1123       dst = vec16(dst);
1124    brw_push_insn_state(p);
1125    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1126    brw_set_mask_control(p, BRW_MASK_DISABLE);
1127
1128    if (stride_8_8_1) {
1129       brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
1130    } else if (stride_0_1_0) {
1131       brw_MOV(p, dst, retype(mask, dst.type));
1132    }
1133    brw_pop_insn_state(p);
1134 }
1135
1136 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1137  * the ADD instruction.
1138  */
1139 void
1140 fs_generator::generate_set_sample_id(fs_inst *inst,
1141                                      struct brw_reg dst,
1142                                      struct brw_reg src0,
1143                                      struct brw_reg src1)
1144 {
1145    assert(dst.type == BRW_REGISTER_TYPE_D ||
1146           dst.type == BRW_REGISTER_TYPE_UD);
1147    assert(src0.type == BRW_REGISTER_TYPE_D ||
1148           src0.type == BRW_REGISTER_TYPE_UD);
1149
1150    brw_push_insn_state(p);
1151    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1152    brw_set_mask_control(p, BRW_MASK_DISABLE);
1153    struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
1154    brw_ADD(p, dst, src0, reg);
1155    if (dispatch_width == 16)
1156       brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
1157    brw_pop_insn_state(p);
1158 }
1159
1160 /**
1161  * Change the register's data type from UD to W, doubling the strides in order
1162  * to compensate for halving the data type width.
1163  */
1164 static struct brw_reg
1165 ud_reg_to_w(struct brw_reg r)
1166 {
1167    assert(r.type == BRW_REGISTER_TYPE_UD);
1168    r.type = BRW_REGISTER_TYPE_W;
1169
1170    /* The BRW_*_STRIDE enums are defined so that incrementing the field
1171     * doubles the real stride.
1172     */
1173    if (r.hstride != 0)
1174       ++r.hstride;
1175    if (r.vstride != 0)
1176       ++r.vstride;
1177
1178    return r;
1179 }
1180
1181 void
1182 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1183                                             struct brw_reg dst,
1184                                             struct brw_reg x,
1185                                             struct brw_reg y)
1186 {
1187    assert(brw->gen >= 7);
1188    assert(dst.type == BRW_REGISTER_TYPE_UD);
1189    assert(x.type == BRW_REGISTER_TYPE_F);
1190    assert(y.type == BRW_REGISTER_TYPE_F);
1191
1192    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1193     *
1194     *   Because this instruction does not have a 16-bit floating-point type,
1195     *   the destination data type must be Word (W).
1196     *
1197     *   The destination must be DWord-aligned and specify a horizontal stride
1198     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1199     *   each destination channel and the upper word is not modified.
1200     */
1201    struct brw_reg dst_w = ud_reg_to_w(dst);
1202
1203    /* Give each 32-bit channel of dst the form below , where "." means
1204     * unchanged.
1205     *   0x....hhhh
1206     */
1207    brw_F32TO16(p, dst_w, y);
1208
1209    /* Now the form:
1210     *   0xhhhh0000
1211     */
1212    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1213
1214    /* And, finally the form of packHalf2x16's output:
1215     *   0xhhhhllll
1216     */
1217    brw_F32TO16(p, dst_w, x);
1218 }
1219
1220 void
1221 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1222                                               struct brw_reg dst,
1223                                               struct brw_reg src)
1224 {
1225    assert(brw->gen >= 7);
1226    assert(dst.type == BRW_REGISTER_TYPE_F);
1227    assert(src.type == BRW_REGISTER_TYPE_UD);
1228
1229    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1230     *
1231     *   Because this instruction does not have a 16-bit floating-point type,
1232     *   the source data type must be Word (W). The destination type must be
1233     *   F (Float).
1234     */
1235    struct brw_reg src_w = ud_reg_to_w(src);
1236
1237    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1238     * For the Y case, we wish to access only the upper word; therefore
1239     * a 16-bit subregister offset is needed.
1240     */
1241    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1242           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1243    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1244       src_w.subnr += 2;
1245
1246    brw_F16TO32(p, dst, src_w);
1247 }
1248
1249 void
1250 fs_generator::generate_shader_time_add(fs_inst *inst,
1251                                        struct brw_reg payload,
1252                                        struct brw_reg offset,
1253                                        struct brw_reg value)
1254 {
1255    assert(brw->gen >= 7);
1256    brw_push_insn_state(p);
1257    brw_set_mask_control(p, true);
1258
1259    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1260    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1261                                           offset.type);
1262    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1263                                          value.type);
1264
1265    assert(offset.file == BRW_IMMEDIATE_VALUE);
1266    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1267       value.width = BRW_WIDTH_1;
1268       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1269       value.vstride = BRW_VERTICAL_STRIDE_0;
1270    } else {
1271       assert(value.file == BRW_IMMEDIATE_VALUE);
1272    }
1273
1274    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1275     * case, and we don't really care about squeezing every bit of performance
1276     * out of this path, so we just emit the MOVs from here.
1277     */
1278    brw_MOV(p, payload_offset, offset);
1279    brw_MOV(p, payload_value, value);
1280    brw_shader_time_add(p, payload,
1281                        c->prog_data.base.binding_table.shader_time_start);
1282    brw_pop_insn_state(p);
1283
1284    brw_mark_surface_used(&c->prog_data.base,
1285                          c->prog_data.base.binding_table.shader_time_start);
1286 }
1287
1288 void
1289 fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
1290                                       struct brw_reg atomic_op,
1291                                       struct brw_reg surf_index)
1292 {
1293    assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
1294           atomic_op.type == BRW_REGISTER_TYPE_UD &&
1295           surf_index.file == BRW_IMMEDIATE_VALUE &&
1296           surf_index.type == BRW_REGISTER_TYPE_UD);
1297
1298    brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
1299                       atomic_op.dw1.ud, surf_index.dw1.ud,
1300                       inst->mlen, dispatch_width / 8);
1301
1302    brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
1303 }
1304
1305 void
1306 fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
1307                                             struct brw_reg surf_index)
1308 {
1309    assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
1310           surf_index.type == BRW_REGISTER_TYPE_UD);
1311
1312    brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
1313                             surf_index.dw1.ud,
1314                             inst->mlen, dispatch_width / 8);
1315
1316    brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
1317 }
1318
1319 void
1320 fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
1321 {
1322    int last_native_insn_offset = p->next_insn_offset;
1323    const char *last_annotation_string = NULL;
1324    const void *last_annotation_ir = NULL;
1325
1326    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1327       if (prog) {
1328          printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
1329                 prog->Name, dispatch_width);
1330       } else if (fp) {
1331          printf("Native code for fragment program %d (SIMD%d dispatch):\n",
1332                 fp->Base.Id, dispatch_width);
1333       } else {
1334          printf("Native code for blorp program (SIMD%d dispatch):\n",
1335                 dispatch_width);
1336       }
1337    }
1338
1339    cfg_t *cfg = NULL;
1340    if (unlikely(INTEL_DEBUG & DEBUG_WM))
1341       cfg = new(mem_ctx) cfg_t(instructions);
1342
1343    foreach_list(node, instructions) {
1344       fs_inst *inst = (fs_inst *)node;
1345       struct brw_reg src[3], dst;
1346
1347       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1348          foreach_list(node, &cfg->block_list) {
1349             bblock_link *link = (bblock_link *)node;
1350             bblock_t *block = link->block;
1351
1352             if (block->start == inst) {
1353                printf("   START B%d", block->block_num);
1354                foreach_list(predecessor_node, &block->parents) {
1355                   bblock_link *predecessor_link =
1356                      (bblock_link *)predecessor_node;
1357                   bblock_t *predecessor_block = predecessor_link->block;
1358                   printf(" <-B%d", predecessor_block->block_num);
1359                }
1360                printf("\n");
1361             }
1362          }
1363
1364          if (last_annotation_ir != inst->ir) {
1365             last_annotation_ir = inst->ir;
1366             if (last_annotation_ir) {
1367                printf("   ");
1368                if (prog)
1369                   ((ir_instruction *)inst->ir)->print();
1370                else {
1371                   const prog_instruction *fpi;
1372                   fpi = (const prog_instruction *)inst->ir;
1373                   printf("%d: ", (int)(fpi - (fp ? fp->Base.Instructions : 0)));
1374                   _mesa_fprint_instruction_opt(stdout,
1375                                                fpi,
1376                                                0, PROG_PRINT_DEBUG, NULL);
1377                }
1378                printf("\n");
1379             }
1380          }
1381          if (last_annotation_string != inst->annotation) {
1382             last_annotation_string = inst->annotation;
1383             if (last_annotation_string)
1384                printf("   %s\n", last_annotation_string);
1385          }
1386       }
1387
1388       for (unsigned int i = 0; i < 3; i++) {
1389          src[i] = brw_reg_from_fs_reg(&inst->src[i]);
1390
1391          /* The accumulator result appears to get used for the
1392           * conditional modifier generation.  When negating a UD
1393           * value, there is a 33rd bit generated for the sign in the
1394           * accumulator value, so now you can't check, for example,
1395           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1396           */
1397          assert(!inst->conditional_mod ||
1398                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1399                 !inst->src[i].negate);
1400       }
1401       dst = brw_reg_from_fs_reg(&inst->dst);
1402
1403       brw_set_conditionalmod(p, inst->conditional_mod);
1404       brw_set_predicate_control(p, inst->predicate);
1405       brw_set_predicate_inverse(p, inst->predicate_inverse);
1406       brw_set_flag_reg(p, 0, inst->flag_subreg);
1407       brw_set_saturate(p, inst->saturate);
1408       brw_set_mask_control(p, inst->force_writemask_all);
1409
1410       if (inst->force_uncompressed || dispatch_width == 8) {
1411          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1412       } else if (inst->force_sechalf) {
1413          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1414       } else {
1415          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1416       }
1417
1418       switch (inst->opcode) {
1419       case BRW_OPCODE_MOV:
1420          brw_MOV(p, dst, src[0]);
1421          break;
1422       case BRW_OPCODE_ADD:
1423          brw_ADD(p, dst, src[0], src[1]);
1424          break;
1425       case BRW_OPCODE_MUL:
1426          brw_MUL(p, dst, src[0], src[1]);
1427          break;
1428       case BRW_OPCODE_AVG:
1429          brw_AVG(p, dst, src[0], src[1]);
1430          break;
1431       case BRW_OPCODE_MACH:
1432          brw_set_acc_write_control(p, 1);
1433          brw_MACH(p, dst, src[0], src[1]);
1434          brw_set_acc_write_control(p, 0);
1435          break;
1436
1437       case BRW_OPCODE_MAD:
1438          assert(brw->gen >= 6);
1439          brw_set_access_mode(p, BRW_ALIGN_16);
1440          if (dispatch_width == 16 && !brw->is_haswell) {
1441             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1442             brw_MAD(p, dst, src[0], src[1], src[2]);
1443             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1444             brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1445             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1446          } else {
1447             brw_MAD(p, dst, src[0], src[1], src[2]);
1448          }
1449          brw_set_access_mode(p, BRW_ALIGN_1);
1450          break;
1451
1452       case BRW_OPCODE_LRP:
1453          assert(brw->gen >= 6);
1454          brw_set_access_mode(p, BRW_ALIGN_16);
1455          if (dispatch_width == 16 && !brw->is_haswell) {
1456             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1457             brw_LRP(p, dst, src[0], src[1], src[2]);
1458             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1459             brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1460             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1461          } else {
1462             brw_LRP(p, dst, src[0], src[1], src[2]);
1463          }
1464          brw_set_access_mode(p, BRW_ALIGN_1);
1465          break;
1466
1467       case BRW_OPCODE_FRC:
1468          brw_FRC(p, dst, src[0]);
1469          break;
1470       case BRW_OPCODE_RNDD:
1471          brw_RNDD(p, dst, src[0]);
1472          break;
1473       case BRW_OPCODE_RNDE:
1474          brw_RNDE(p, dst, src[0]);
1475          break;
1476       case BRW_OPCODE_RNDZ:
1477          brw_RNDZ(p, dst, src[0]);
1478          break;
1479
1480       case BRW_OPCODE_AND:
1481          brw_AND(p, dst, src[0], src[1]);
1482          break;
1483       case BRW_OPCODE_OR:
1484          brw_OR(p, dst, src[0], src[1]);
1485          break;
1486       case BRW_OPCODE_XOR:
1487          brw_XOR(p, dst, src[0], src[1]);
1488          break;
1489       case BRW_OPCODE_NOT:
1490          brw_NOT(p, dst, src[0]);
1491          break;
1492       case BRW_OPCODE_ASR:
1493          brw_ASR(p, dst, src[0], src[1]);
1494          break;
1495       case BRW_OPCODE_SHR:
1496          brw_SHR(p, dst, src[0], src[1]);
1497          break;
1498       case BRW_OPCODE_SHL:
1499          brw_SHL(p, dst, src[0], src[1]);
1500          break;
1501       case BRW_OPCODE_F32TO16:
1502          assert(brw->gen >= 7);
1503          brw_F32TO16(p, dst, src[0]);
1504          break;
1505       case BRW_OPCODE_F16TO32:
1506          assert(brw->gen >= 7);
1507          brw_F16TO32(p, dst, src[0]);
1508          break;
1509       case BRW_OPCODE_CMP:
1510          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1511          break;
1512       case BRW_OPCODE_SEL:
1513          brw_SEL(p, dst, src[0], src[1]);
1514          break;
1515       case BRW_OPCODE_BFREV:
1516          assert(brw->gen >= 7);
1517          /* BFREV only supports UD type for src and dst. */
1518          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1519                       retype(src[0], BRW_REGISTER_TYPE_UD));
1520          break;
1521       case BRW_OPCODE_FBH:
1522          assert(brw->gen >= 7);
1523          /* FBH only supports UD type for dst. */
1524          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1525          break;
1526       case BRW_OPCODE_FBL:
1527          assert(brw->gen >= 7);
1528          /* FBL only supports UD type for dst. */
1529          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1530          break;
1531       case BRW_OPCODE_CBIT:
1532          assert(brw->gen >= 7);
1533          /* CBIT only supports UD type for dst. */
1534          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1535          break;
1536       case BRW_OPCODE_ADDC:
1537          assert(brw->gen >= 7);
1538          brw_set_acc_write_control(p, 1);
1539          brw_ADDC(p, dst, src[0], src[1]);
1540          brw_set_acc_write_control(p, 0);
1541          break;
1542       case BRW_OPCODE_SUBB:
1543          assert(brw->gen >= 7);
1544          brw_set_acc_write_control(p, 1);
1545          brw_SUBB(p, dst, src[0], src[1]);
1546          brw_set_acc_write_control(p, 0);
1547          break;
1548
1549       case BRW_OPCODE_BFE:
1550          assert(brw->gen >= 7);
1551          brw_set_access_mode(p, BRW_ALIGN_16);
1552          if (dispatch_width == 16 && !brw->is_haswell) {
1553             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1554             brw_BFE(p, dst, src[0], src[1], src[2]);
1555             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1556             brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1557             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1558          } else {
1559             brw_BFE(p, dst, src[0], src[1], src[2]);
1560          }
1561          brw_set_access_mode(p, BRW_ALIGN_1);
1562          break;
1563
1564       case BRW_OPCODE_BFI1:
1565          assert(brw->gen >= 7);
1566          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1567           * should
1568           *
1569           *    "Force BFI instructions to be executed always in SIMD8."
1570           */
1571          if (dispatch_width == 16 && brw->is_haswell) {
1572             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1573             brw_BFI1(p, dst, src[0], src[1]);
1574             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1575             brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
1576             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1577          } else {
1578             brw_BFI1(p, dst, src[0], src[1]);
1579          }
1580          break;
1581       case BRW_OPCODE_BFI2:
1582          assert(brw->gen >= 7);
1583          brw_set_access_mode(p, BRW_ALIGN_16);
1584          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1585           * should
1586           *
1587           *    "Force BFI instructions to be executed always in SIMD8."
1588           *
1589           * Otherwise we would be able to emit compressed instructions like we
1590           * do for the other three-source instructions.
1591           */
1592          if (dispatch_width == 16) {
1593             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1594             brw_BFI2(p, dst, src[0], src[1], src[2]);
1595             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1596             brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1597             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1598          } else {
1599             brw_BFI2(p, dst, src[0], src[1], src[2]);
1600          }
1601          brw_set_access_mode(p, BRW_ALIGN_1);
1602          break;
1603
1604       case BRW_OPCODE_IF:
1605          if (inst->src[0].file != BAD_FILE) {
1606             /* The instruction has an embedded compare (only allowed on gen6) */
1607             assert(brw->gen == 6);
1608             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1609          } else {
1610             brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1611          }
1612          break;
1613
1614       case BRW_OPCODE_ELSE:
1615          brw_ELSE(p);
1616          break;
1617       case BRW_OPCODE_ENDIF:
1618          brw_ENDIF(p);
1619          break;
1620
1621       case BRW_OPCODE_DO:
1622          brw_DO(p, BRW_EXECUTE_8);
1623          break;
1624
1625       case BRW_OPCODE_BREAK:
1626          brw_BREAK(p);
1627          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1628          break;
1629       case BRW_OPCODE_CONTINUE:
1630          /* FINISHME: We need to write the loop instruction support still. */
1631          if (brw->gen >= 6)
1632             gen6_CONT(p);
1633          else
1634             brw_CONT(p);
1635          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1636          break;
1637
1638       case BRW_OPCODE_WHILE:
1639          brw_WHILE(p);
1640          break;
1641
1642       case SHADER_OPCODE_RCP:
1643       case SHADER_OPCODE_RSQ:
1644       case SHADER_OPCODE_SQRT:
1645       case SHADER_OPCODE_EXP2:
1646       case SHADER_OPCODE_LOG2:
1647       case SHADER_OPCODE_SIN:
1648       case SHADER_OPCODE_COS:
1649          if (brw->gen >= 7) {
1650             generate_math1_gen7(inst, dst, src[0]);
1651          } else if (brw->gen == 6) {
1652             generate_math1_gen6(inst, dst, src[0]);
1653          } else if (brw->gen == 5 || brw->is_g4x) {
1654             generate_math_g45(inst, dst, src[0]);
1655          } else {
1656             generate_math_gen4(inst, dst, src[0]);
1657          }
1658          break;
1659       case SHADER_OPCODE_INT_QUOTIENT:
1660       case SHADER_OPCODE_INT_REMAINDER:
1661       case SHADER_OPCODE_POW:
1662          if (brw->gen >= 7) {
1663             generate_math2_gen7(inst, dst, src[0], src[1]);
1664          } else if (brw->gen == 6) {
1665             generate_math2_gen6(inst, dst, src[0], src[1]);
1666          } else {
1667             generate_math_gen4(inst, dst, src[0]);
1668          }
1669          break;
1670       case FS_OPCODE_PIXEL_X:
1671          generate_pixel_xy(dst, true);
1672          break;
1673       case FS_OPCODE_PIXEL_Y:
1674          generate_pixel_xy(dst, false);
1675          break;
1676       case FS_OPCODE_CINTERP:
1677          brw_MOV(p, dst, src[0]);
1678          break;
1679       case FS_OPCODE_LINTERP:
1680          generate_linterp(inst, dst, src);
1681          break;
1682       case SHADER_OPCODE_TEX:
1683       case FS_OPCODE_TXB:
1684       case SHADER_OPCODE_TXD:
1685       case SHADER_OPCODE_TXF:
1686       case SHADER_OPCODE_TXF_CMS:
1687       case SHADER_OPCODE_TXF_UMS:
1688       case SHADER_OPCODE_TXF_MCS:
1689       case SHADER_OPCODE_TXL:
1690       case SHADER_OPCODE_TXS:
1691       case SHADER_OPCODE_LOD:
1692       case SHADER_OPCODE_TG4:
1693       case SHADER_OPCODE_TG4_OFFSET:
1694          generate_tex(inst, dst, src[0]);
1695          break;
1696       case FS_OPCODE_DDX:
1697          generate_ddx(inst, dst, src[0]);
1698          break;
1699       case FS_OPCODE_DDY:
1700          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1701           * guarantee that c->key.render_to_fbo is set).
1702           */
1703          assert(fp->UsesDFdy);
1704          generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
1705          break;
1706
1707       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1708          generate_scratch_write(inst, src[0]);
1709          break;
1710
1711       case SHADER_OPCODE_GEN4_SCRATCH_READ:
1712          generate_scratch_read(inst, dst);
1713          break;
1714
1715       case SHADER_OPCODE_GEN7_SCRATCH_READ:
1716          generate_scratch_read_gen7(inst, dst);
1717          break;
1718
1719       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1720          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1721          break;
1722
1723       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1724          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1725          break;
1726
1727       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1728          generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
1729          break;
1730
1731       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1732          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1733          break;
1734
1735       case FS_OPCODE_FB_WRITE:
1736          generate_fb_write(inst);
1737          break;
1738
1739       case FS_OPCODE_BLORP_FB_WRITE:
1740          generate_blorp_fb_write(inst);
1741          break;
1742
1743       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1744          generate_mov_dispatch_to_flags(inst);
1745          break;
1746
1747       case FS_OPCODE_DISCARD_JUMP:
1748          generate_discard_jump(inst);
1749          break;
1750
1751       case SHADER_OPCODE_SHADER_TIME_ADD:
1752          generate_shader_time_add(inst, src[0], src[1], src[2]);
1753          break;
1754
1755       case SHADER_OPCODE_UNTYPED_ATOMIC:
1756          generate_untyped_atomic(inst, dst, src[0], src[1]);
1757          break;
1758
1759       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1760          generate_untyped_surface_read(inst, dst, src[0]);
1761          break;
1762
1763       case FS_OPCODE_SET_SIMD4X2_OFFSET:
1764          generate_set_simd4x2_offset(inst, dst, src[0]);
1765          break;
1766
1767       case FS_OPCODE_SET_OMASK:
1768          generate_set_omask(inst, dst, src[0]);
1769          break;
1770
1771       case FS_OPCODE_SET_SAMPLE_ID:
1772          generate_set_sample_id(inst, dst, src[0], src[1]);
1773          break;
1774
1775       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1776           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
1777           break;
1778
1779       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1780       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1781          generate_unpack_half_2x16_split(inst, dst, src[0]);
1782          break;
1783
1784       case FS_OPCODE_PLACEHOLDER_HALT:
1785          /* This is the place where the final HALT needs to be inserted if
1786           * we've emitted any discards.  If not, this will emit no code.
1787           */
1788          patch_discard_jumps_to_fb_writes();
1789          break;
1790
1791       default:
1792          if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1793             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1794                           opcode_descs[inst->opcode].name);
1795          } else {
1796             _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1797          }
1798          abort();
1799       }
1800
1801       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1802          brw_dump_compile(p, stdout,
1803                           last_native_insn_offset, p->next_insn_offset);
1804
1805          foreach_list(node, &cfg->block_list) {
1806             bblock_link *link = (bblock_link *)node;
1807             bblock_t *block = link->block;
1808
1809             if (block->end == inst) {
1810                printf("   END B%d", block->block_num);
1811                foreach_list(successor_node, &block->children) {
1812                   bblock_link *successor_link =
1813                      (bblock_link *)successor_node;
1814                   bblock_t *successor_block = successor_link->block;
1815                   printf(" ->B%d", successor_block->block_num);
1816                }
1817                printf("\n");
1818             }
1819          }
1820       }
1821
1822       last_native_insn_offset = p->next_insn_offset;
1823    }
1824
1825    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1826       printf("\n");
1827    }
1828
1829    brw_set_uip_jip(p);
1830
1831    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1832     * emit issues, it doesn't get the jump distances into the output,
1833     * which is often something we want to debug.  So this is here in
1834     * case you're doing that.
1835     */
1836    if (dump_file) {
1837       brw_dump_compile(p, dump_file, 0, p->next_insn_offset);
1838    }
1839 }
1840
1841 const unsigned *
1842 fs_generator::generate_assembly(exec_list *simd8_instructions,
1843                                 exec_list *simd16_instructions,
1844                                 unsigned *assembly_size,
1845                                 FILE *dump_file)
1846 {
1847    assert(simd8_instructions || simd16_instructions);
1848
1849    if (simd8_instructions) {
1850       dispatch_width = 8;
1851       generate_code(simd8_instructions, dump_file);
1852    }
1853
1854    if (simd16_instructions) {
1855       /* We have to do a compaction pass now, or the one at the end of
1856        * execution will squash down where our prog_offset start needs
1857        * to be.
1858        */
1859       brw_compact_instructions(p);
1860
1861       /* align to 64 byte boundary. */
1862       while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
1863          brw_NOP(p);
1864       }
1865
1866       /* Save off the start of this SIMD16 program */
1867       c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
1868
1869       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1870
1871       dispatch_width = 16;
1872       generate_code(simd16_instructions, dump_file);
1873    }
1874
1875    return brw_get_program(p, assembly_size);
1876 }