src/mesa/drivers/dri/i965/brw_fs_generator.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_generator.cpp
  25  *
  26  * This file supports generating code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 extern "C" {
  31 #include "main/macros.h"
  32 #include "brw_context.h"
  33 #include "brw_eu.h"
  34 } /* extern "C" */
  35
  36 #include "brw_fs.h"
  37 #include "brw_cfg.h"
  38
  39 fs_generator::fs_generator(struct brw_context *brw,
  40                            struct brw_wm_compile *c,
  41                            struct gl_shader_program *prog,
  42                            struct gl_fragment_program *fp,
  43                            bool dual_source_output)
  44
  45    : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
  46 {
  47    ctx = &brw->ctx;
  48
  49    mem_ctx = c;
  50
  51    p = rzalloc(mem_ctx, struct brw_compile);
  52    brw_init_compile(brw, p, mem_ctx);
  53 }
  54
  55 fs_generator::~fs_generator()
  56 {
  57 }
  58
  59 void
  60 fs_generator::patch_discard_jumps_to_fb_writes()
  61 {
  62    if (brw->gen < 6 || this->discard_halt_patches.is_empty())
  63       return;
  64
  65    /* There is a somewhat strange undocumented requirement of using
  66     * HALT, according to the simulator.  If some channel has HALTed to
  67     * a particular UIP, then by the end of the program, every channel
  68     * must have HALTed to that UIP.  Furthermore, the tracking is a
  69     * stack, so you can't do the final halt of a UIP after starting
  70     * halting to a new UIP.
  71     *
  72     * Symptoms of not emitting this instruction on actual hardware
  73     * included GPU hangs and sparkly rendering on the piglit discard
  74     * tests.
  75     */
  76    struct brw_instruction *last_halt = gen6_HALT(p);
  77    last_halt->bits3.break_cont.uip = 2;
  78    last_halt->bits3.break_cont.jip = 2;
  79
  80    int ip = p->nr_insn;
  81
  82    foreach_list(node, &this->discard_halt_patches) {
  83       ip_record *patch_ip = (ip_record *)node;
  84       struct brw_instruction *patch = &p->store[patch_ip->ip];
  85
  86       assert(patch->header.opcode == BRW_OPCODE_HALT);
  87       /* HALT takes a half-instruction distance from the pre-incremented IP. */
  88       patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
  89    }
  90
  91    this->discard_halt_patches.make_empty();
  92 }
  93
  94 void
  95 fs_generator::generate_fb_write(fs_inst *inst)
  96 {
  97    bool eot = inst->eot;
  98    struct brw_reg implied_header;
  99    uint32_t msg_control;
 100
 101    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
 102     * move, here's g1.
 103     */
 104    brw_push_insn_state(p);
 105    brw_set_mask_control(p, BRW_MASK_DISABLE);
 106    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 107
 108    if (inst->header_present) {
 109       /* On HSW, the GPU will use the predicate on SENDC, unless the header is
 110        * present.
 111        */
 112       if (!brw->is_haswell && ((fp && fp->UsesKill) ||
 113                                c->key.alpha_test_func)) {
 114          struct brw_reg pixel_mask;
 115
 116          if (brw->gen >= 6)
 117             pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 118          else
 119             pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 120
 121          brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
 122       }
 123
 124       if (brw->gen >= 6) {
 125          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 126          brw_MOV(p,
 127                  retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
 128                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 129          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 130
 131          if (inst->target > 0 && c->key.replicate_alpha) {
 132             /* Set "Source0 Alpha Present to RenderTarget" bit in message
 133              * header.
 134              */
 135             brw_OR(p,
 136                    vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
 137                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
 138                    brw_imm_ud(0x1 << 11));
 139          }
 140
 141          if (inst->target > 0) {
 142             /* Set the render target index for choosing BLEND_STATE. */
 143             brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
 144                                            inst->base_mrf, 2),
 145                               BRW_REGISTER_TYPE_UD),
 146                     brw_imm_ud(inst->target));
 147          }
 148
 149          implied_header = brw_null_reg();
 150       } else {
 151          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 152
 153          brw_MOV(p,
 154                  brw_message_reg(inst->base_mrf + 1),
 155                  brw_vec8_grf(1, 0));
 156       }
 157    } else {
 158       implied_header = brw_null_reg();
 159    }
 160
 161    if (this->dual_source_output)
 162       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 163    else if (dispatch_width == 16)
 164       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 165    else
 166       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 167
 168    brw_pop_insn_state(p);
 169
 170    uint32_t surf_index =
 171       c->prog_data.binding_table.render_target_start + inst->target;
 172    brw_fb_WRITE(p,
 173                 dispatch_width,
 174                 inst->base_mrf,
 175                 implied_header,
 176                 msg_control,
 177                 surf_index,
 178                 inst->mlen,
 179                 0,
 180                 eot,
 181                 inst->header_present);
 182
 183    brw_mark_surface_used(&c->prog_data.base, surf_index);
 184 }
 185
 186 void
 187 fs_generator::generate_blorp_fb_write(fs_inst *inst)
 188 {
 189    brw_fb_WRITE(p,
 190                 16 /* dispatch_width */,
 191                 inst->base_mrf,
 192                 brw_reg_from_fs_reg(&inst->src[0]),
 193                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
 194                 inst->target,
 195                 inst->mlen,
 196                 0,
 197                 true,
 198                 inst->header_present);
 199 }
 200
 201 /* Computes the integer pixel x,y values from the origin.
 202  *
 203  * This is the basis of gl_FragCoord computation, but is also used
 204  * pre-gen6 for computing the deltas from v0 for computing
 205  * interpolation.
 206  */
 207 void
 208 fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
 209 {
 210    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 211    struct brw_reg src;
 212    struct brw_reg deltas;
 213
 214    if (is_x) {
 215       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
 216       deltas = brw_imm_v(0x10101010);
 217    } else {
 218       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
 219       deltas = brw_imm_v(0x11001100);
 220    }
 221
 222    if (dispatch_width == 16) {
 223       dst = vec16(dst);
 224    }
 225
 226    /* We do this SIMD8 or SIMD16, but since the destination is UW we
 227     * don't do compression in the SIMD16 case.
 228     */
 229    brw_push_insn_state(p);
 230    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 231    brw_ADD(p, dst, src, deltas);
 232    brw_pop_insn_state(p);
 233 }
 234
 235 void
 236 fs_generator::generate_linterp(fs_inst *inst,
 237                              struct brw_reg dst, struct brw_reg *src)
 238 {
 239    struct brw_reg delta_x = src[0];
 240    struct brw_reg delta_y = src[1];
 241    struct brw_reg interp = src[2];
 242
 243    if (brw->has_pln &&
 244        delta_y.nr == delta_x.nr + 1 &&
 245        (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
 246       brw_PLN(p, dst, interp, delta_x);
 247    } else {
 248       brw_LINE(p, brw_null_reg(), interp, delta_x);
 249       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 250    }
 251 }
 252
 253 void
 254 fs_generator::generate_math1_gen7(fs_inst *inst,
 255                                 struct brw_reg dst,
 256                                 struct brw_reg src0)
 257 {
 258    assert(inst->mlen == 0);
 259    brw_math(p, dst,
 260             brw_math_function(inst->opcode),
 261             0, src0,
 262             BRW_MATH_DATA_VECTOR,
 263             BRW_MATH_PRECISION_FULL);
 264 }
 265
 266 void
 267 fs_generator::generate_math2_gen7(fs_inst *inst,
 268                                 struct brw_reg dst,
 269                                 struct brw_reg src0,
 270                                 struct brw_reg src1)
 271 {
 272    assert(inst->mlen == 0);
 273    brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
 274 }
 275
 276 void
 277 fs_generator::generate_math1_gen6(fs_inst *inst,
 278                                 struct brw_reg dst,
 279                                 struct brw_reg src0)
 280 {
 281    int op = brw_math_function(inst->opcode);
 282
 283    assert(inst->mlen == 0);
 284
 285    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 286    brw_math(p, dst,
 287             op,
 288             0, src0,
 289             BRW_MATH_DATA_VECTOR,
 290             BRW_MATH_PRECISION_FULL);
 291
 292    if (dispatch_width == 16) {
 293       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 294       brw_math(p, sechalf(dst),
 295                op,
 296                0, sechalf(src0),
 297                BRW_MATH_DATA_VECTOR,
 298                BRW_MATH_PRECISION_FULL);
 299       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 300    }
 301 }
 302
 303 void
 304 fs_generator::generate_math2_gen6(fs_inst *inst,
 305                                 struct brw_reg dst,
 306                                 struct brw_reg src0,
 307                                 struct brw_reg src1)
 308 {
 309    int op = brw_math_function(inst->opcode);
 310
 311    assert(inst->mlen == 0);
 312
 313    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 314    brw_math2(p, dst, op, src0, src1);
 315
 316    if (dispatch_width == 16) {
 317       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 318       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
 319       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 320    }
 321 }
 322
 323 void
 324 fs_generator::generate_math_gen4(fs_inst *inst,
 325                                struct brw_reg dst,
 326                                struct brw_reg src)
 327 {
 328    int op = brw_math_function(inst->opcode);
 329
 330    assert(inst->mlen >= 1);
 331
 332    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 333    brw_math(p, dst,
 334             op,
 335             inst->base_mrf, src,
 336             BRW_MATH_DATA_VECTOR,
 337             BRW_MATH_PRECISION_FULL);
 338
 339    if (dispatch_width == 16) {
 340       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 341       brw_math(p, sechalf(dst),
 342                op,
 343                inst->base_mrf + 1, sechalf(src),
 344                BRW_MATH_DATA_VECTOR,
 345                BRW_MATH_PRECISION_FULL);
 346
 347       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 348    }
 349 }
 350
 351 void
 352 fs_generator::generate_math_g45(fs_inst *inst,
 353                                 struct brw_reg dst,
 354                                 struct brw_reg src)
 355 {
 356    if (inst->opcode == SHADER_OPCODE_POW ||
 357        inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
 358        inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
 359       generate_math_gen4(inst, dst, src);
 360       return;
 361    }
 362
 363    int op = brw_math_function(inst->opcode);
 364
 365    assert(inst->mlen >= 1);
 366
 367    brw_math(p, dst,
 368             op,
 369             inst->base_mrf, src,
 370             BRW_MATH_DATA_VECTOR,
 371             BRW_MATH_PRECISION_FULL);
 372 }
 373
 374 void
 375 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 376 {
 377    int msg_type = -1;
 378    int rlen = 4;
 379    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 380    uint32_t return_format;
 381
 382    switch (dst.type) {
 383    case BRW_REGISTER_TYPE_D:
 384       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
 385       break;
 386    case BRW_REGISTER_TYPE_UD:
 387       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 388       break;
 389    default:
 390       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 391       break;
 392    }
 393
 394    if (dispatch_width == 16 &&
 395       !inst->force_uncompressed && !inst->force_sechalf)
 396       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 397
 398    if (brw->gen >= 5) {
 399       switch (inst->opcode) {
 400       case SHADER_OPCODE_TEX:
 401          if (inst->shadow_compare) {
 402             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 403          } else {
 404             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 405          }
 406          break;
 407       case FS_OPCODE_TXB:
 408          if (inst->shadow_compare) {
 409             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 410          } else {
 411             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 412          }
 413          break;
 414       case SHADER_OPCODE_TXL:
 415          if (inst->shadow_compare) {
 416             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 417          } else {
 418             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 419          }
 420          break;
 421       case SHADER_OPCODE_TXS:
 422          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 423          break;
 424       case SHADER_OPCODE_TXD:
 425          if (inst->shadow_compare) {
 426             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
 427             assert(brw->is_haswell);
 428             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
 429          } else {
 430             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 431          }
 432          break;
 433       case SHADER_OPCODE_TXF:
 434          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 435          break;
 436       case SHADER_OPCODE_TXF_CMS:
 437          if (brw->gen >= 7)
 438             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
 439          else
 440             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 441          break;
 442       case SHADER_OPCODE_TXF_UMS:
 443          assert(brw->gen >= 7);
 444          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
 445          break;
 446       case SHADER_OPCODE_TXF_MCS:
 447          assert(brw->gen >= 7);
 448          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
 449          break;
 450       case SHADER_OPCODE_LOD:
 451          msg_type = GEN5_SAMPLER_MESSAGE_LOD;
 452          break;
 453       case SHADER_OPCODE_TG4:
 454          if (inst->shadow_compare) {
 455             assert(brw->gen >= 7);
 456             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
 457          } else {
 458             assert(brw->gen >= 6);
 459             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
 460          }
 461          break;
 462       case SHADER_OPCODE_TG4_OFFSET:
 463          assert(brw->gen >= 7);
 464          if (inst->shadow_compare) {
 465             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
 466          } else {
 467             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
 468          }
 469          break;
 470       default:
 471          assert(!"not reached");
 472          break;
 473       }
 474    } else {
 475       switch (inst->opcode) {
 476       case SHADER_OPCODE_TEX:
 477          /* Note that G45 and older determines shadow compare and dispatch width
 478           * from message length for most messages.
 479           */
 480          assert(dispatch_width == 8);
 481          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
 482          if (inst->shadow_compare) {
 483             assert(inst->mlen == 6);
 484          } else {
 485             assert(inst->mlen <= 4);
 486          }
 487          break;
 488       case FS_OPCODE_TXB:
 489          if (inst->shadow_compare) {
 490             assert(inst->mlen == 6);
 491             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 492          } else {
 493             assert(inst->mlen == 9);
 494             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 495             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 496          }
 497          break;
 498       case SHADER_OPCODE_TXL:
 499          if (inst->shadow_compare) {
 500             assert(inst->mlen == 6);
 501             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 502          } else {
 503             assert(inst->mlen == 9);
 504             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
 505             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 506          }
 507          break;
 508       case SHADER_OPCODE_TXD:
 509          /* There is no sample_d_c message; comparisons are done manually */
 510          assert(inst->mlen == 7 || inst->mlen == 10);
 511          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 512          break;
 513       case SHADER_OPCODE_TXF:
 514          assert(inst->mlen == 9);
 515          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 516          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 517          break;
 518       case SHADER_OPCODE_TXS:
 519          assert(inst->mlen == 3);
 520          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
 521          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 522          break;
 523       default:
 524          assert(!"not reached");
 525          break;
 526       }
 527    }
 528    assert(msg_type != -1);
 529
 530    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 531       rlen = 8;
 532       dst = vec16(dst);
 533    }
 534
 535    if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
 536       /* The send-from-GRF for SIMD16 texturing with a header has an extra
 537        * hardware register allocated to it, which we need to skip over (since
 538        * our coordinates in the payload are in the even-numbered registers,
 539        * and the header comes right before the first one).
 540        */
 541       assert(src.file == BRW_GENERAL_REGISTER_FILE);
 542       src.nr++;
 543    }
 544
 545    /* Load the message header if present.  If there's a texture offset,
 546     * we need to set it up explicitly and load the offset bitfield.
 547     * Otherwise, we can use an implied move from g0 to the first message reg.
 548     */
 549    if (inst->header_present) {
 550       if (brw->gen < 6 && !inst->texture_offset) {
 551          /* Set up an implied move from g0 to the MRF. */
 552          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 553       } else {
 554          struct brw_reg header_reg;
 555
 556          if (brw->gen >= 7) {
 557             header_reg = src;
 558          } else {
 559             assert(inst->base_mrf != -1);
 560             header_reg = brw_message_reg(inst->base_mrf);
 561          }
 562
 563          brw_push_insn_state(p);
 564          brw_set_mask_control(p, BRW_MASK_DISABLE);
 565          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 566          /* Explicitly set up the message header by copying g0 to the MRF. */
 567          brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
 568
 569          if (inst->texture_offset) {
 570             /* Set the offset bits in DWord 2. */
 571             brw_MOV(p, get_element_ud(header_reg, 2),
 572                        brw_imm_ud(inst->texture_offset));
 573          }
 574
 575          if (inst->sampler >= 16) {
 576             /* The "Sampler Index" field can only store values between 0 and 15.
 577              * However, we can add an offset to the "Sampler State Pointer"
 578              * field, effectively selecting a different set of 16 samplers.
 579              *
 580              * The "Sampler State Pointer" needs to be aligned to a 32-byte
 581              * offset, and each sampler state is only 16-bytes, so we can't
 582              * exclusively use the offset - we have to use both.
 583              */
 584             assert(brw->is_haswell); /* field only exists on Haswell */
 585             brw_ADD(p,
 586                     get_element_ud(header_reg, 3),
 587                     get_element_ud(brw_vec8_grf(0, 0), 3),
 588                     brw_imm_ud(16 * (inst->sampler / 16) *
 589                                sizeof(gen7_sampler_state)));
 590          }
 591          brw_pop_insn_state(p);
 592       }
 593    }
 594
 595    uint32_t surface_index = ((inst->opcode == SHADER_OPCODE_TG4 ||
 596       inst->opcode == SHADER_OPCODE_TG4_OFFSET)
 597       ? c->prog_data.base.binding_table.gather_texture_start
 598       : c->prog_data.base.binding_table.texture_start) + inst->sampler;
 599
 600    brw_SAMPLE(p,
 601               retype(dst, BRW_REGISTER_TYPE_UW),
 602               inst->base_mrf,
 603               src,
 604               surface_index,
 605               inst->sampler % 16,
 606               msg_type,
 607               rlen,
 608               inst->mlen,
 609               inst->header_present,
 610               simd_mode,
 611               return_format);
 612
 613    brw_mark_surface_used(&c->prog_data.base, surface_index);
 614 }
 615
 616
 617 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 618  * looking like:
 619  *
 620  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 621  *
 622  * Ideally, we want to produce:
 623  *
 624  *           DDX                     DDY
 625  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 626  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 627  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 628  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 629  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 630  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 631  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 632  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 633  *
 634  * and add another set of two more subspans if in 16-pixel dispatch mode.
 635  *
 636  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 637  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 638  * pair.  But the ideal approximation may impose a huge performance cost on
 639  * sample_d.  On at least Haswell, sample_d instruction does some
 640  * optimizations if the same LOD is used for all pixels in the subspan.
 641  *
 642  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 643  * appropriate swizzling.
 644  */
 645 void
 646 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 647 {
 648    unsigned vstride, width;
 649
 650    if (c->key.high_quality_derivatives) {
 651       /* produce accurate derivatives */
 652       vstride = BRW_VERTICAL_STRIDE_2;
 653       width = BRW_WIDTH_2;
 654    }
 655    else {
 656       /* replicate the derivative at the top-left pixel to other pixels */
 657       vstride = BRW_VERTICAL_STRIDE_4;
 658       width = BRW_WIDTH_4;
 659    }
 660
 661    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 662                                  BRW_REGISTER_TYPE_F,
 663                                  vstride,
 664                                  width,
 665                                  BRW_HORIZONTAL_STRIDE_0,
 666                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 667    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 668                                  BRW_REGISTER_TYPE_F,
 669                                  vstride,
 670                                  width,
 671                                  BRW_HORIZONTAL_STRIDE_0,
 672                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 673    brw_ADD(p, dst, src0, negate(src1));
 674 }
 675
 676 /* The negate_value boolean is used to negate the derivative computation for
 677  * FBOs, since they place the origin at the upper left instead of the lower
 678  * left.
 679  */
 680 void
 681 fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
 682                          bool negate_value)
 683 {
 684    if (c->key.high_quality_derivatives) {
 685       /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
 686        * Region Restrictions):
 687        *
 688        *     In Align16 access mode, SIMD16 is not allowed for DW operations
 689        *     and SIMD8 is not allowed for DF operations.
 690        *
 691        * In this context, "DW operations" means "operations acting on 32-bit
 692        * values", so it includes operations on floats.
 693        *
 694        * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
 695        * (Instruction Compression -> Rules and Restrictions):
 696        *
 697        *     A compressed instruction must be in Align1 access mode. Align16
 698        *     mode instructions cannot be compressed.
 699        *
 700        * Similar text exists in the g45 PRM.
 701        *
 702        * On these platforms, if we're building a SIMD16 shader, we need to
 703        * manually unroll to a pair of SIMD8 instructions.
 704        */
 705       bool unroll_to_simd8 =
 706          (dispatch_width == 16 &&
 707           (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
 708
 709       /* produce accurate derivatives */
 710       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 711                                     BRW_REGISTER_TYPE_F,
 712                                     BRW_VERTICAL_STRIDE_4,
 713                                     BRW_WIDTH_4,
 714                                     BRW_HORIZONTAL_STRIDE_1,
 715                                     BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
 716       struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 717                                     BRW_REGISTER_TYPE_F,
 718                                     BRW_VERTICAL_STRIDE_4,
 719                                     BRW_WIDTH_4,
 720                                     BRW_HORIZONTAL_STRIDE_1,
 721                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
 722       brw_push_insn_state(p);
 723       brw_set_access_mode(p, BRW_ALIGN_16);
 724       if (unroll_to_simd8)
 725          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 726       if (negate_value)
 727          brw_ADD(p, dst, src1, negate(src0));
 728       else
 729          brw_ADD(p, dst, src0, negate(src1));
 730       if (unroll_to_simd8) {
 731          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 732          src0 = sechalf(src0);
 733          src1 = sechalf(src1);
 734          dst = sechalf(dst);
 735          if (negate_value)
 736             brw_ADD(p, dst, src1, negate(src0));
 737          else
 738             brw_ADD(p, dst, src0, negate(src1));
 739       }
 740       brw_pop_insn_state(p);
 741    } else {
 742       /* replicate the derivative at the top-left pixel to other pixels */
 743       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 744                                     BRW_REGISTER_TYPE_F,
 745                                     BRW_VERTICAL_STRIDE_4,
 746                                     BRW_WIDTH_4,
 747                                     BRW_HORIZONTAL_STRIDE_0,
 748                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 749       struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
 750                                     BRW_REGISTER_TYPE_F,
 751                                     BRW_VERTICAL_STRIDE_4,
 752                                     BRW_WIDTH_4,
 753                                     BRW_HORIZONTAL_STRIDE_0,
 754                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 755       if (negate_value)
 756          brw_ADD(p, dst, src1, negate(src0));
 757       else
 758          brw_ADD(p, dst, src0, negate(src1));
 759    }
 760 }
 761
 762 void
 763 fs_generator::generate_discard_jump(fs_inst *inst)
 764 {
 765    assert(brw->gen >= 6);
 766
 767    /* This HALT will be patched up at FB write time to point UIP at the end of
 768     * the program, and at brw_uip_jip() JIP will be set to the end of the
 769     * current block (or the program).
 770     */
 771    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
 772
 773    brw_push_insn_state(p);
 774    brw_set_mask_control(p, BRW_MASK_DISABLE);
 775    gen6_HALT(p);
 776    brw_pop_insn_state(p);
 777 }
 778
 779 void
 780 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
 781 {
 782    assert(inst->mlen != 0);
 783
 784    brw_MOV(p,
 785            retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
 786            retype(src, BRW_REGISTER_TYPE_UD));
 787    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
 788                                  dispatch_width / 8, inst->offset);
 789 }
 790
 791 void
 792 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
 793 {
 794    assert(inst->mlen != 0);
 795
 796    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
 797                                 dispatch_width / 8, inst->offset);
 798 }
 799
 800 void
 801 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
 802 {
 803    gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);
 804 }
 805
 806 void
 807 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 808                                                   struct brw_reg dst,
 809                                                   struct brw_reg index,
 810                                                   struct brw_reg offset)
 811 {
 812    assert(inst->mlen != 0);
 813
 814    assert(index.file == BRW_IMMEDIATE_VALUE &&
 815           index.type == BRW_REGISTER_TYPE_UD);
 816    uint32_t surf_index = index.dw1.ud;
 817
 818    assert(offset.file == BRW_IMMEDIATE_VALUE &&
 819           offset.type == BRW_REGISTER_TYPE_UD);
 820    uint32_t read_offset = offset.dw1.ud;
 821
 822    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
 823                         read_offset, surf_index);
 824
 825    brw_mark_surface_used(&c->prog_data.base, surf_index);
 826 }
 827
 828 void
 829 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 830                                                        struct brw_reg dst,
 831                                                        struct brw_reg index,
 832                                                        struct brw_reg offset)
 833 {
 834    assert(inst->mlen == 0);
 835
 836    assert(index.file == BRW_IMMEDIATE_VALUE &&
 837           index.type == BRW_REGISTER_TYPE_UD);
 838    uint32_t surf_index = index.dw1.ud;
 839
 840    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
 841    /* Reference just the dword we need, to avoid angering validate_reg(). */
 842    offset = brw_vec1_grf(offset.nr, 0);
 843
 844    brw_push_insn_state(p);
 845    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 846    brw_set_mask_control(p, BRW_MASK_DISABLE);
 847    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 848    brw_pop_insn_state(p);
 849
 850    /* We use the SIMD4x2 mode because we want to end up with 4 components in
 851     * the destination loaded consecutively from the same offset (which appears
 852     * in the first component, and the rest are ignored).
 853     */
 854    dst.width = BRW_WIDTH_4;
 855    brw_set_dest(p, send, dst);
 856    brw_set_src0(p, send, offset);
 857    brw_set_sampler_message(p, send,
 858                            surf_index,
 859                            0, /* LD message ignores sampler unit */
 860                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 861                            1, /* rlen */
 862                            1, /* mlen */
 863                            false, /* no header */
 864                            BRW_SAMPLER_SIMD_MODE_SIMD4X2,
 865                            0);
 866
 867    brw_mark_surface_used(&c->prog_data.base, surf_index);
 868 }
 869
 870 void
 871 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
 872                                                   struct brw_reg dst,
 873                                                   struct brw_reg index,
 874                                                   struct brw_reg offset)
 875 {
 876    assert(brw->gen < 7); /* Should use the gen7 variant. */
 877    assert(inst->header_present);
 878    assert(inst->mlen);
 879
 880    assert(index.file == BRW_IMMEDIATE_VALUE &&
 881           index.type == BRW_REGISTER_TYPE_UD);
 882    uint32_t surf_index = index.dw1.ud;
 883
 884    uint32_t simd_mode, rlen, msg_type;
 885    if (dispatch_width == 16) {
 886       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 887       rlen = 8;
 888    } else {
 889       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 890       rlen = 4;
 891    }
 892
 893    if (brw->gen >= 5)
 894       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 895    else {
 896       /* We always use the SIMD16 message so that we only have to load U, and
 897        * not V or R.
 898        */
 899       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 900       assert(inst->mlen == 3);
 901       assert(inst->regs_written == 8);
 902       rlen = 8;
 903       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 904    }
 905
 906    struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
 907                                       BRW_REGISTER_TYPE_D);
 908    brw_MOV(p, offset_mrf, offset);
 909
 910    struct brw_reg header = brw_vec8_grf(0, 0);
 911    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 912
 913    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 914    send->header.compression_control = BRW_COMPRESSION_NONE;
 915    brw_set_dest(p, send, dst);
 916    brw_set_src0(p, send, header);
 917    if (brw->gen < 6)
 918       send->header.destreg__conditionalmod = inst->base_mrf;
 919
 920    /* Our surface is set up as floats, regardless of what actual data is
 921     * stored in it.
 922     */
 923    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 924    brw_set_sampler_message(p, send,
 925                            surf_index,
 926                            0, /* sampler (unused) */
 927                            msg_type,
 928                            rlen,
 929                            inst->mlen,
 930                            inst->header_present,
 931                            simd_mode,
 932                            return_format);
 933
 934    brw_mark_surface_used(&c->prog_data.base, surf_index);
 935 }
 936
 937 void
 938 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
 939                                                        struct brw_reg dst,
 940                                                        struct brw_reg index,
 941                                                        struct brw_reg offset)
 942 {
 943    assert(brw->gen >= 7);
 944    /* Varying-offset pull constant loads are treated as a normal expression on
 945     * gen7, so the fact that it's a send message is hidden at the IR level.
 946     */
 947    assert(!inst->header_present);
 948    assert(!inst->mlen);
 949
 950    assert(index.file == BRW_IMMEDIATE_VALUE &&
 951           index.type == BRW_REGISTER_TYPE_UD);
 952    uint32_t surf_index = index.dw1.ud;
 953
 954    uint32_t simd_mode, rlen, mlen;
 955    if (dispatch_width == 16) {
 956       mlen = 2;
 957       rlen = 8;
 958       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 959    } else {
 960       mlen = 1;
 961       rlen = 4;
 962       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 963    }
 964
 965    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
 966    brw_set_dest(p, send, dst);
 967    brw_set_src0(p, send, offset);
 968    brw_set_sampler_message(p, send,
 969                            surf_index,
 970                            0, /* LD message ignores sampler unit */
 971                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 972                            rlen,
 973                            mlen,
 974                            false, /* no header */
 975                            simd_mode,
 976                            0);
 977
 978    brw_mark_surface_used(&c->prog_data.base, surf_index);
 979 }
 980
 981 /**
 982  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 983  * into the flags register (f0.0).
 984  *
 985  * Used only on Gen6 and above.
 986  */
 987 void
 988 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
 989 {
 990    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
 991    struct brw_reg dispatch_mask;
 992
 993    if (brw->gen >= 6)
 994       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 995    else
 996       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 997
 998    brw_push_insn_state(p);
 999    brw_set_mask_control(p, BRW_MASK_DISABLE);
1000    brw_MOV(p, flags, dispatch_mask);
1001    brw_pop_insn_state(p);
1002 }
1003
1004
1005 static uint32_t brw_file_from_reg(fs_reg *reg)
1006 {
1007    switch (reg->file) {
1008    case GRF:
1009       return BRW_GENERAL_REGISTER_FILE;
1010    case MRF:
1011       return BRW_MESSAGE_REGISTER_FILE;
1012    case IMM:
1013       return BRW_IMMEDIATE_VALUE;
1014    default:
1015       assert(!"not reached");
1016       return BRW_GENERAL_REGISTER_FILE;
1017    }
1018 }
1019
1020 struct brw_reg
1021 brw_reg_from_fs_reg(fs_reg *reg)
1022 {
1023    struct brw_reg brw_reg;
1024
1025    switch (reg->file) {
1026    case GRF:
1027    case MRF:
1028       if (reg->stride == 0) {
1029          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
1030       } else {
1031          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
1032          brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
1033       }
1034
1035       brw_reg = retype(brw_reg, reg->type);
1036       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
1037       break;
1038    case IMM:
1039       switch (reg->type) {
1040       case BRW_REGISTER_TYPE_F:
1041          brw_reg = brw_imm_f(reg->imm.f);
1042          break;
1043       case BRW_REGISTER_TYPE_D:
1044          brw_reg = brw_imm_d(reg->imm.i);
1045          break;
1046       case BRW_REGISTER_TYPE_UD:
1047          brw_reg = brw_imm_ud(reg->imm.u);
1048          break;
1049       default:
1050          assert(!"not reached");
1051          brw_reg = brw_null_reg();
1052          break;
1053       }
1054       break;
1055    case HW_REG:
1056       assert(reg->type == reg->fixed_hw_reg.type);
1057       brw_reg = reg->fixed_hw_reg;
1058       break;
1059    case BAD_FILE:
1060       /* Probably unused. */
1061       brw_reg = brw_null_reg();
1062       break;
1063    case UNIFORM:
1064       assert(!"not reached");
1065       brw_reg = brw_null_reg();
1066       break;
1067    default:
1068       assert(!"not reached");
1069       brw_reg = brw_null_reg();
1070       break;
1071    }
1072    if (reg->abs)
1073       brw_reg = brw_abs(brw_reg);
1074    if (reg->negate)
1075       brw_reg = negate(brw_reg);
1076
1077    return brw_reg;
1078 }
1079
1080 /**
1081  * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
1082  * sampler LD messages.
1083  *
1084  * We don't want to bake it into the send message's code generation because
1085  * that means we don't get a chance to schedule the instructions.
1086  */
1087 void
1088 fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
1089                                           struct brw_reg dst,
1090                                           struct brw_reg value)
1091 {
1092    assert(value.file == BRW_IMMEDIATE_VALUE);
1093
1094    brw_push_insn_state(p);
1095    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1096    brw_set_mask_control(p, BRW_MASK_DISABLE);
1097    brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
1098    brw_pop_insn_state(p);
1099 }
1100
1101 /* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
1102  * (when mask is passed as a uniform) of register mask before moving it
1103  * to register dst.
1104  */
1105 void
1106 fs_generator::generate_set_omask(fs_inst *inst,
1107                                  struct brw_reg dst,
1108                                  struct brw_reg mask)
1109 {
1110    bool stride_8_8_1 =
1111     (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
1112      mask.width == BRW_WIDTH_8 &&
1113      mask.hstride == BRW_HORIZONTAL_STRIDE_1);
1114
1115    bool stride_0_1_0 =
1116     (mask.vstride == BRW_VERTICAL_STRIDE_0 &&
1117      mask.width == BRW_WIDTH_1 &&
1118      mask.hstride == BRW_HORIZONTAL_STRIDE_0);
1119
1120    assert(stride_8_8_1 || stride_0_1_0);
1121    assert(dst.type == BRW_REGISTER_TYPE_UW);
1122
1123    if (dispatch_width == 16)
1124       dst = vec16(dst);
1125    brw_push_insn_state(p);
1126    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1127    brw_set_mask_control(p, BRW_MASK_DISABLE);
1128
1129    if (stride_8_8_1) {
1130       brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
1131    } else if (stride_0_1_0) {
1132       brw_MOV(p, dst, retype(mask, dst.type));
1133    }
1134    brw_pop_insn_state(p);
1135 }
1136
1137 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1138  * the ADD instruction.
1139  */
1140 void
1141 fs_generator::generate_set_sample_id(fs_inst *inst,
1142                                      struct brw_reg dst,
1143                                      struct brw_reg src0,
1144                                      struct brw_reg src1)
1145 {
1146    assert(dst.type == BRW_REGISTER_TYPE_D ||
1147           dst.type == BRW_REGISTER_TYPE_UD);
1148    assert(src0.type == BRW_REGISTER_TYPE_D ||
1149           src0.type == BRW_REGISTER_TYPE_UD);
1150
1151    brw_push_insn_state(p);
1152    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1153    brw_set_mask_control(p, BRW_MASK_DISABLE);
1154    struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
1155    brw_ADD(p, dst, src0, reg);
1156    if (dispatch_width == 16)
1157       brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
1158    brw_pop_insn_state(p);
1159 }
1160
1161 /**
1162  * Change the register's data type from UD to W, doubling the strides in order
1163  * to compensate for halving the data type width.
1164  */
1165 static struct brw_reg
1166 ud_reg_to_w(struct brw_reg r)
1167 {
1168    assert(r.type == BRW_REGISTER_TYPE_UD);
1169    r.type = BRW_REGISTER_TYPE_W;
1170
1171    /* The BRW_*_STRIDE enums are defined so that incrementing the field
1172     * doubles the real stride.
1173     */
1174    if (r.hstride != 0)
1175       ++r.hstride;
1176    if (r.vstride != 0)
1177       ++r.vstride;
1178
1179    return r;
1180 }
1181
1182 void
1183 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1184                                             struct brw_reg dst,
1185                                             struct brw_reg x,
1186                                             struct brw_reg y)
1187 {
1188    assert(brw->gen >= 7);
1189    assert(dst.type == BRW_REGISTER_TYPE_UD);
1190    assert(x.type == BRW_REGISTER_TYPE_F);
1191    assert(y.type == BRW_REGISTER_TYPE_F);
1192
1193    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1194     *
1195     *   Because this instruction does not have a 16-bit floating-point type,
1196     *   the destination data type must be Word (W).
1197     *
1198     *   The destination must be DWord-aligned and specify a horizontal stride
1199     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1200     *   each destination channel and the upper word is not modified.
1201     */
1202    struct brw_reg dst_w = ud_reg_to_w(dst);
1203
1204    /* Give each 32-bit channel of dst the form below , where "." means
1205     * unchanged.
1206     *   0x....hhhh
1207     */
1208    brw_F32TO16(p, dst_w, y);
1209
1210    /* Now the form:
1211     *   0xhhhh0000
1212     */
1213    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1214
1215    /* And, finally the form of packHalf2x16's output:
1216     *   0xhhhhllll
1217     */
1218    brw_F32TO16(p, dst_w, x);
1219 }
1220
1221 void
1222 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1223                                               struct brw_reg dst,
1224                                               struct brw_reg src)
1225 {
1226    assert(brw->gen >= 7);
1227    assert(dst.type == BRW_REGISTER_TYPE_F);
1228    assert(src.type == BRW_REGISTER_TYPE_UD);
1229
1230    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1231     *
1232     *   Because this instruction does not have a 16-bit floating-point type,
1233     *   the source data type must be Word (W). The destination type must be
1234     *   F (Float).
1235     */
1236    struct brw_reg src_w = ud_reg_to_w(src);
1237
1238    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1239     * For the Y case, we wish to access only the upper word; therefore
1240     * a 16-bit subregister offset is needed.
1241     */
1242    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1243           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1244    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1245       src_w.subnr += 2;
1246
1247    brw_F16TO32(p, dst, src_w);
1248 }
1249
1250 void
1251 fs_generator::generate_shader_time_add(fs_inst *inst,
1252                                        struct brw_reg payload,
1253                                        struct brw_reg offset,
1254                                        struct brw_reg value)
1255 {
1256    assert(brw->gen >= 7);
1257    brw_push_insn_state(p);
1258    brw_set_mask_control(p, true);
1259
1260    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1261    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1262                                           offset.type);
1263    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1264                                          value.type);
1265
1266    assert(offset.file == BRW_IMMEDIATE_VALUE);
1267    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1268       value.width = BRW_WIDTH_1;
1269       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1270       value.vstride = BRW_VERTICAL_STRIDE_0;
1271    } else {
1272       assert(value.file == BRW_IMMEDIATE_VALUE);
1273    }
1274
1275    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1276     * case, and we don't really care about squeezing every bit of performance
1277     * out of this path, so we just emit the MOVs from here.
1278     */
1279    brw_MOV(p, payload_offset, offset);
1280    brw_MOV(p, payload_value, value);
1281    brw_shader_time_add(p, payload,
1282                        c->prog_data.base.binding_table.shader_time_start);
1283    brw_pop_insn_state(p);
1284
1285    brw_mark_surface_used(&c->prog_data.base,
1286                          c->prog_data.base.binding_table.shader_time_start);
1287 }
1288
1289 void
1290 fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
1291                                       struct brw_reg atomic_op,
1292                                       struct brw_reg surf_index)
1293 {
1294    assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
1295           atomic_op.type == BRW_REGISTER_TYPE_UD &&
1296           surf_index.file == BRW_IMMEDIATE_VALUE &&
1297           surf_index.type == BRW_REGISTER_TYPE_UD);
1298
1299    brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
1300                       atomic_op.dw1.ud, surf_index.dw1.ud,
1301                       inst->mlen, dispatch_width / 8);
1302
1303    brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
1304 }
1305
1306 void
1307 fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
1308                                             struct brw_reg surf_index)
1309 {
1310    assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
1311           surf_index.type == BRW_REGISTER_TYPE_UD);
1312
1313    brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
1314                             surf_index.dw1.ud,
1315                             inst->mlen, dispatch_width / 8);
1316
1317    brw_mark_surface_used(&c->prog_data.base, surf_index.dw1.ud);
1318 }
1319
1320 void
1321 fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
1322 {
1323    int last_native_insn_offset = p->next_insn_offset;
1324    const char *last_annotation_string = NULL;
1325    const void *last_annotation_ir = NULL;
1326
1327    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1328       if (prog) {
1329          fprintf(stderr,
1330                  "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
1331                  prog->Label ? prog->Label : "unnamed",
1332                  prog->Name, dispatch_width);
1333       } else if (fp) {
1334          fprintf(stderr,
1335                  "Native code for fragment program %d (SIMD%d dispatch):\n",
1336                  fp->Base.Id, dispatch_width);
1337       } else {
1338          fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
1339                  dispatch_width);
1340       }
1341    }
1342
1343    cfg_t *cfg = NULL;
1344    if (unlikely(INTEL_DEBUG & DEBUG_WM))
1345       cfg = new(mem_ctx) cfg_t(instructions);
1346
1347    foreach_list(node, instructions) {
1348       fs_inst *inst = (fs_inst *)node;
1349       struct brw_reg src[3], dst;
1350
1351       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1352          foreach_list(node, &cfg->block_list) {
1353             bblock_link *link = (bblock_link *)node;
1354             bblock_t *block = link->block;
1355
1356             if (block->start == inst) {
1357                fprintf(stderr, "   START B%d", block->block_num);
1358                foreach_list(predecessor_node, &block->parents) {
1359                   bblock_link *predecessor_link =
1360                      (bblock_link *)predecessor_node;
1361                   bblock_t *predecessor_block = predecessor_link->block;
1362                   fprintf(stderr, " <-B%d", predecessor_block->block_num);
1363                }
1364                fprintf(stderr, "\n");
1365             }
1366          }
1367
1368          if (last_annotation_ir != inst->ir) {
1369             last_annotation_ir = inst->ir;
1370             if (last_annotation_ir) {
1371                fprintf(stderr, "   ");
1372                if (prog)
1373                   ((ir_instruction *)inst->ir)->fprint(stderr);
1374                else {
1375                   const prog_instruction *fpi;
1376                   fpi = (const prog_instruction *)inst->ir;
1377                   fprintf(stderr, "%d: ",
1378                           (int)(fpi - (fp ? fp->Base.Instructions : 0)));
1379                   _mesa_fprint_instruction_opt(stderr,
1380                                                fpi,
1381                                                0, PROG_PRINT_DEBUG, NULL);
1382                }
1383                fprintf(stderr, "\n");
1384             }
1385          }
1386          if (last_annotation_string != inst->annotation) {
1387             last_annotation_string = inst->annotation;
1388             if (last_annotation_string)
1389                fprintf(stderr, "   %s\n", last_annotation_string);
1390          }
1391       }
1392
1393       for (unsigned int i = 0; i < 3; i++) {
1394          src[i] = brw_reg_from_fs_reg(&inst->src[i]);
1395
1396          /* The accumulator result appears to get used for the
1397           * conditional modifier generation.  When negating a UD
1398           * value, there is a 33rd bit generated for the sign in the
1399           * accumulator value, so now you can't check, for example,
1400           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1401           */
1402          assert(!inst->conditional_mod ||
1403                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1404                 !inst->src[i].negate);
1405       }
1406       dst = brw_reg_from_fs_reg(&inst->dst);
1407
1408       brw_set_conditionalmod(p, inst->conditional_mod);
1409       brw_set_predicate_control(p, inst->predicate);
1410       brw_set_predicate_inverse(p, inst->predicate_inverse);
1411       brw_set_flag_reg(p, 0, inst->flag_subreg);
1412       brw_set_saturate(p, inst->saturate);
1413       brw_set_mask_control(p, inst->force_writemask_all);
1414
1415       if (inst->force_uncompressed || dispatch_width == 8) {
1416          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1417       } else if (inst->force_sechalf) {
1418          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1419       } else {
1420          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1421       }
1422
1423       switch (inst->opcode) {
1424       case BRW_OPCODE_MOV:
1425          brw_MOV(p, dst, src[0]);
1426          break;
1427       case BRW_OPCODE_ADD:
1428          brw_ADD(p, dst, src[0], src[1]);
1429          break;
1430       case BRW_OPCODE_MUL:
1431          brw_MUL(p, dst, src[0], src[1]);
1432          break;
1433       case BRW_OPCODE_AVG:
1434          brw_AVG(p, dst, src[0], src[1]);
1435          break;
1436       case BRW_OPCODE_MACH:
1437          brw_set_acc_write_control(p, 1);
1438          brw_MACH(p, dst, src[0], src[1]);
1439          brw_set_acc_write_control(p, 0);
1440          break;
1441
1442       case BRW_OPCODE_MAD:
1443          assert(brw->gen >= 6);
1444          brw_set_access_mode(p, BRW_ALIGN_16);
1445          if (dispatch_width == 16 && !brw->is_haswell) {
1446             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1447             brw_MAD(p, dst, src[0], src[1], src[2]);
1448             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1449             brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1450             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1451          } else {
1452             brw_MAD(p, dst, src[0], src[1], src[2]);
1453          }
1454          brw_set_access_mode(p, BRW_ALIGN_1);
1455          break;
1456
1457       case BRW_OPCODE_LRP:
1458          assert(brw->gen >= 6);
1459          brw_set_access_mode(p, BRW_ALIGN_16);
1460          if (dispatch_width == 16 && !brw->is_haswell) {
1461             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1462             brw_LRP(p, dst, src[0], src[1], src[2]);
1463             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1464             brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1465             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1466          } else {
1467             brw_LRP(p, dst, src[0], src[1], src[2]);
1468          }
1469          brw_set_access_mode(p, BRW_ALIGN_1);
1470          break;
1471
1472       case BRW_OPCODE_FRC:
1473          brw_FRC(p, dst, src[0]);
1474          break;
1475       case BRW_OPCODE_RNDD:
1476          brw_RNDD(p, dst, src[0]);
1477          break;
1478       case BRW_OPCODE_RNDE:
1479          brw_RNDE(p, dst, src[0]);
1480          break;
1481       case BRW_OPCODE_RNDZ:
1482          brw_RNDZ(p, dst, src[0]);
1483          break;
1484
1485       case BRW_OPCODE_AND:
1486          brw_AND(p, dst, src[0], src[1]);
1487          break;
1488       case BRW_OPCODE_OR:
1489          brw_OR(p, dst, src[0], src[1]);
1490          break;
1491       case BRW_OPCODE_XOR:
1492          brw_XOR(p, dst, src[0], src[1]);
1493          break;
1494       case BRW_OPCODE_NOT:
1495          brw_NOT(p, dst, src[0]);
1496          break;
1497       case BRW_OPCODE_ASR:
1498          brw_ASR(p, dst, src[0], src[1]);
1499          break;
1500       case BRW_OPCODE_SHR:
1501          brw_SHR(p, dst, src[0], src[1]);
1502          break;
1503       case BRW_OPCODE_SHL:
1504          brw_SHL(p, dst, src[0], src[1]);
1505          break;
1506       case BRW_OPCODE_F32TO16:
1507          assert(brw->gen >= 7);
1508          brw_F32TO16(p, dst, src[0]);
1509          break;
1510       case BRW_OPCODE_F16TO32:
1511          assert(brw->gen >= 7);
1512          brw_F16TO32(p, dst, src[0]);
1513          break;
1514       case BRW_OPCODE_CMP:
1515          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1516          break;
1517       case BRW_OPCODE_SEL:
1518          brw_SEL(p, dst, src[0], src[1]);
1519          break;
1520       case BRW_OPCODE_BFREV:
1521          assert(brw->gen >= 7);
1522          /* BFREV only supports UD type for src and dst. */
1523          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1524                       retype(src[0], BRW_REGISTER_TYPE_UD));
1525          break;
1526       case BRW_OPCODE_FBH:
1527          assert(brw->gen >= 7);
1528          /* FBH only supports UD type for dst. */
1529          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1530          break;
1531       case BRW_OPCODE_FBL:
1532          assert(brw->gen >= 7);
1533          /* FBL only supports UD type for dst. */
1534          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1535          break;
1536       case BRW_OPCODE_CBIT:
1537          assert(brw->gen >= 7);
1538          /* CBIT only supports UD type for dst. */
1539          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1540          break;
1541       case BRW_OPCODE_ADDC:
1542          assert(brw->gen >= 7);
1543          brw_set_acc_write_control(p, 1);
1544          brw_ADDC(p, dst, src[0], src[1]);
1545          brw_set_acc_write_control(p, 0);
1546          break;
1547       case BRW_OPCODE_SUBB:
1548          assert(brw->gen >= 7);
1549          brw_set_acc_write_control(p, 1);
1550          brw_SUBB(p, dst, src[0], src[1]);
1551          brw_set_acc_write_control(p, 0);
1552          break;
1553
1554       case BRW_OPCODE_BFE:
1555          assert(brw->gen >= 7);
1556          brw_set_access_mode(p, BRW_ALIGN_16);
1557          if (dispatch_width == 16 && !brw->is_haswell) {
1558             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1559             brw_BFE(p, dst, src[0], src[1], src[2]);
1560             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1561             brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1562             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1563          } else {
1564             brw_BFE(p, dst, src[0], src[1], src[2]);
1565          }
1566          brw_set_access_mode(p, BRW_ALIGN_1);
1567          break;
1568
1569       case BRW_OPCODE_BFI1:
1570          assert(brw->gen >= 7);
1571          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1572           * should
1573           *
1574           *    "Force BFI instructions to be executed always in SIMD8."
1575           */
1576          if (dispatch_width == 16 && brw->is_haswell) {
1577             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1578             brw_BFI1(p, dst, src[0], src[1]);
1579             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1580             brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
1581             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1582          } else {
1583             brw_BFI1(p, dst, src[0], src[1]);
1584          }
1585          break;
1586       case BRW_OPCODE_BFI2:
1587          assert(brw->gen >= 7);
1588          brw_set_access_mode(p, BRW_ALIGN_16);
1589          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1590           * should
1591           *
1592           *    "Force BFI instructions to be executed always in SIMD8."
1593           *
1594           * Otherwise we would be able to emit compressed instructions like we
1595           * do for the other three-source instructions.
1596           */
1597          if (dispatch_width == 16) {
1598             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1599             brw_BFI2(p, dst, src[0], src[1], src[2]);
1600             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1601             brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1602             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1603          } else {
1604             brw_BFI2(p, dst, src[0], src[1], src[2]);
1605          }
1606          brw_set_access_mode(p, BRW_ALIGN_1);
1607          break;
1608
1609       case BRW_OPCODE_IF:
1610          if (inst->src[0].file != BAD_FILE) {
1611             /* The instruction has an embedded compare (only allowed on gen6) */
1612             assert(brw->gen == 6);
1613             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1614          } else {
1615             brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1616          }
1617          break;
1618
1619       case BRW_OPCODE_ELSE:
1620          brw_ELSE(p);
1621          break;
1622       case BRW_OPCODE_ENDIF:
1623          brw_ENDIF(p);
1624          break;
1625
1626       case BRW_OPCODE_DO:
1627          brw_DO(p, BRW_EXECUTE_8);
1628          break;
1629
1630       case BRW_OPCODE_BREAK:
1631          brw_BREAK(p);
1632          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1633          break;
1634       case BRW_OPCODE_CONTINUE:
1635          /* FINISHME: We need to write the loop instruction support still. */
1636          if (brw->gen >= 6)
1637             gen6_CONT(p);
1638          else
1639             brw_CONT(p);
1640          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1641          break;
1642
1643       case BRW_OPCODE_WHILE:
1644          brw_WHILE(p);
1645          break;
1646
1647       case SHADER_OPCODE_RCP:
1648       case SHADER_OPCODE_RSQ:
1649       case SHADER_OPCODE_SQRT:
1650       case SHADER_OPCODE_EXP2:
1651       case SHADER_OPCODE_LOG2:
1652       case SHADER_OPCODE_SIN:
1653       case SHADER_OPCODE_COS:
1654          if (brw->gen >= 7) {
1655             generate_math1_gen7(inst, dst, src[0]);
1656          } else if (brw->gen == 6) {
1657             generate_math1_gen6(inst, dst, src[0]);
1658          } else if (brw->gen == 5 || brw->is_g4x) {
1659             generate_math_g45(inst, dst, src[0]);
1660          } else {
1661             generate_math_gen4(inst, dst, src[0]);
1662          }
1663          break;
1664       case SHADER_OPCODE_INT_QUOTIENT:
1665       case SHADER_OPCODE_INT_REMAINDER:
1666       case SHADER_OPCODE_POW:
1667          if (brw->gen >= 7) {
1668             generate_math2_gen7(inst, dst, src[0], src[1]);
1669          } else if (brw->gen == 6) {
1670             generate_math2_gen6(inst, dst, src[0], src[1]);
1671          } else {
1672             generate_math_gen4(inst, dst, src[0]);
1673          }
1674          break;
1675       case FS_OPCODE_PIXEL_X:
1676          generate_pixel_xy(dst, true);
1677          break;
1678       case FS_OPCODE_PIXEL_Y:
1679          generate_pixel_xy(dst, false);
1680          break;
1681       case FS_OPCODE_CINTERP:
1682          brw_MOV(p, dst, src[0]);
1683          break;
1684       case FS_OPCODE_LINTERP:
1685          generate_linterp(inst, dst, src);
1686          break;
1687       case SHADER_OPCODE_TEX:
1688       case FS_OPCODE_TXB:
1689       case SHADER_OPCODE_TXD:
1690       case SHADER_OPCODE_TXF:
1691       case SHADER_OPCODE_TXF_CMS:
1692       case SHADER_OPCODE_TXF_UMS:
1693       case SHADER_OPCODE_TXF_MCS:
1694       case SHADER_OPCODE_TXL:
1695       case SHADER_OPCODE_TXS:
1696       case SHADER_OPCODE_LOD:
1697       case SHADER_OPCODE_TG4:
1698       case SHADER_OPCODE_TG4_OFFSET:
1699          generate_tex(inst, dst, src[0]);
1700          break;
1701       case FS_OPCODE_DDX:
1702          generate_ddx(inst, dst, src[0]);
1703          break;
1704       case FS_OPCODE_DDY:
1705          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1706           * guarantee that c->key.render_to_fbo is set).
1707           */
1708          assert(fp->UsesDFdy);
1709          generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
1710          break;
1711
1712       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1713          generate_scratch_write(inst, src[0]);
1714          break;
1715
1716       case SHADER_OPCODE_GEN4_SCRATCH_READ:
1717          generate_scratch_read(inst, dst);
1718          break;
1719
1720       case SHADER_OPCODE_GEN7_SCRATCH_READ:
1721          generate_scratch_read_gen7(inst, dst);
1722          break;
1723
1724       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1725          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1726          break;
1727
1728       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1729          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1730          break;
1731
1732       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1733          generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
1734          break;
1735
1736       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1737          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1738          break;
1739
1740       case FS_OPCODE_FB_WRITE:
1741          generate_fb_write(inst);
1742          break;
1743
1744       case FS_OPCODE_BLORP_FB_WRITE:
1745          generate_blorp_fb_write(inst);
1746          break;
1747
1748       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1749          generate_mov_dispatch_to_flags(inst);
1750          break;
1751
1752       case FS_OPCODE_DISCARD_JUMP:
1753          generate_discard_jump(inst);
1754          break;
1755
1756       case SHADER_OPCODE_SHADER_TIME_ADD:
1757          generate_shader_time_add(inst, src[0], src[1], src[2]);
1758          break;
1759
1760       case SHADER_OPCODE_UNTYPED_ATOMIC:
1761          generate_untyped_atomic(inst, dst, src[0], src[1]);
1762          break;
1763
1764       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1765          generate_untyped_surface_read(inst, dst, src[0]);
1766          break;
1767
1768       case FS_OPCODE_SET_SIMD4X2_OFFSET:
1769          generate_set_simd4x2_offset(inst, dst, src[0]);
1770          break;
1771
1772       case FS_OPCODE_SET_OMASK:
1773          generate_set_omask(inst, dst, src[0]);
1774          break;
1775
1776       case FS_OPCODE_SET_SAMPLE_ID:
1777          generate_set_sample_id(inst, dst, src[0], src[1]);
1778          break;
1779
1780       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1781           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
1782           break;
1783
1784       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1785       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1786          generate_unpack_half_2x16_split(inst, dst, src[0]);
1787          break;
1788
1789       case FS_OPCODE_PLACEHOLDER_HALT:
1790          /* This is the place where the final HALT needs to be inserted if
1791           * we've emitted any discards.  If not, this will emit no code.
1792           */
1793          patch_discard_jumps_to_fb_writes();
1794          break;
1795
1796       default:
1797          if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1798             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1799                           opcode_descs[inst->opcode].name);
1800          } else {
1801             _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1802          }
1803          abort();
1804       }
1805
1806       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1807          brw_dump_compile(p, stderr,
1808                           last_native_insn_offset, p->next_insn_offset);
1809
1810          foreach_list(node, &cfg->block_list) {
1811             bblock_link *link = (bblock_link *)node;
1812             bblock_t *block = link->block;
1813
1814             if (block->end == inst) {
1815                fprintf(stderr, "   END B%d", block->block_num);
1816                foreach_list(successor_node, &block->children) {
1817                   bblock_link *successor_link =
1818                      (bblock_link *)successor_node;
1819                   bblock_t *successor_block = successor_link->block;
1820                   fprintf(stderr, " ->B%d", successor_block->block_num);
1821                }
1822                fprintf(stderr, "\n");
1823             }
1824          }
1825       }
1826
1827       last_native_insn_offset = p->next_insn_offset;
1828    }
1829
1830    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1831       fprintf(stderr, "\n");
1832    }
1833
1834    brw_set_uip_jip(p);
1835
1836    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1837     * emit issues, it doesn't get the jump distances into the output,
1838     * which is often something we want to debug.  So this is here in
1839     * case you're doing that.
1840     */
1841    if (dump_file) {
1842       brw_dump_compile(p, dump_file, 0, p->next_insn_offset);
1843    }
1844 }
1845
1846 const unsigned *
1847 fs_generator::generate_assembly(exec_list *simd8_instructions,
1848                                 exec_list *simd16_instructions,
1849                                 unsigned *assembly_size,
1850                                 FILE *dump_file)
1851 {
1852    assert(simd8_instructions || simd16_instructions);
1853
1854    if (simd8_instructions) {
1855       dispatch_width = 8;
1856       generate_code(simd8_instructions, dump_file);
1857    }
1858
1859    if (simd16_instructions) {
1860       /* We have to do a compaction pass now, or the one at the end of
1861        * execution will squash down where our prog_offset start needs
1862        * to be.
1863        */
1864       brw_compact_instructions(p);
1865
1866       /* align to 64 byte boundary. */
1867       while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
1868          brw_NOP(p);
1869       }
1870
1871       /* Save off the start of this SIMD16 program */
1872       c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
1873
1874       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1875
1876       dispatch_width = 16;
1877       generate_code(simd16_instructions, dump_file);
1878    }
1879
1880    return brw_get_program(p, assembly_size);
1881 }