src/mesa/drivers/dri/i965/gen8_fs_generator.cpp

   1 /*
   2  * Copyright © 2010, 2011, 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file gen8_fs_generate.cpp
  25  *
  26  * Code generation for Gen8+ hardware.
  27  */
  28
  29 extern "C" {
  30 #include "main/macros.h"
  31 #include "brw_context.h"
  32 } /* extern "C" */
  33
  34 #include "brw_fs.h"
  35 #include "brw_cfg.h"
  36 #include "glsl/ir_print_visitor.h"
  37
  38 gen8_fs_generator::gen8_fs_generator(struct brw_context *brw,
  39                                      struct brw_wm_compile *c,
  40                                      struct gl_shader_program *shader_prog,
  41                                      struct gl_fragment_program *fp,
  42                                      bool dual_source_output)
  43    : gen8_generator(brw, shader_prog, fp ? &fp->Base : NULL, c), c(c), fp(fp),
  44      dual_source_output(dual_source_output)
  45 {
  46    shader =
  47       shader_prog ? shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL;
  48 }
  49
  50 gen8_fs_generator::~gen8_fs_generator()
  51 {
  52 }
  53
  54 void
  55 gen8_fs_generator::mark_surface_used(unsigned surf_index)
  56 {
  57    assert(surf_index < BRW_MAX_SURFACES);
  58
  59    c->prog_data.base.binding_table.size_bytes =
  60       MAX2(c->prog_data.base.binding_table.size_bytes, (surf_index + 1) * 4);
  61 }
  62
  63 void
  64 gen8_fs_generator::generate_fb_write(fs_inst *ir)
  65 {
  66    if (fp && fp->UsesKill) {
  67       gen8_instruction *mov =
  68          MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW),
  69              brw_flag_reg(0, 1));
  70       gen8_set_mask_control(mov, BRW_MASK_DISABLE);
  71    }
  72
  73    if (ir->header_present) {
  74       gen8_instruction *mov =
  75          MOV_RAW(brw_message_reg(ir->base_mrf), brw_vec8_grf(0, 0));
  76       gen8_set_exec_size(mov, BRW_EXECUTE_16);
  77
  78       if (ir->target > 0 && c->key.replicate_alpha) {
  79          /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
  80          OR(vec1(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD)),
  81             vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
  82             brw_imm_ud(1 << 11));
  83       }
  84
  85       if (ir->target > 0) {
  86          /* Set the render target index for choosing BLEND_STATE. */
  87          MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
  88                     BRW_REGISTER_TYPE_UD),
  89              brw_imm_ud(ir->target));
  90       }
  91    }
  92
  93    gen8_instruction *inst = next_inst(BRW_OPCODE_SENDC);
  94    gen8_set_dst(brw, inst, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW));
  95    gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
  96
  97    /* Set up the "Message Specific Control" bits for the Data Port Message
  98     * Descriptor.  These are documented in the "Render Target Write" message's
  99     * "Message Descriptor" documentation (vol5c.2).
 100     */
 101    uint32_t msg_type;
 102    /* Set the Message Type */
 103    if (this->dual_source_output)
 104       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 105    else if (dispatch_width == 16)
 106       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 107    else
 108       msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 109
 110    uint32_t msg_control = msg_type;
 111
 112    /* "Last Render Target Select" must be set on all writes to the last of
 113     * the render targets (if using MRT), or always for a single RT scenario.
 114     */
 115    if ((ir->target == c->key.nr_color_regions - 1) || !c->key.nr_color_regions)
 116       msg_control |= (1 << 4); /* Last Render Target Select */
 117
 118    uint32_t surf_index =
 119       c->prog_data.binding_table.render_target_start + ir->target;
 120
 121    gen8_set_dp_message(brw, inst,
 122                        GEN6_SFID_DATAPORT_RENDER_CACHE,
 123                        surf_index,
 124                        GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
 125                        msg_control,
 126                        ir->mlen,
 127                        0,
 128                        ir->header_present,
 129                        ir->eot);
 130
 131    mark_surface_used(surf_index);
 132 }
 133
 134 void
 135 gen8_fs_generator::generate_linterp(fs_inst *inst,
 136                                     struct brw_reg dst,
 137                                     struct brw_reg *src)
 138 {
 139    struct brw_reg delta_x = src[0];
 140    struct brw_reg delta_y = src[1];
 141    struct brw_reg interp = src[2];
 142
 143    (void) delta_y;
 144    assert(delta_y.nr == delta_x.nr + 1);
 145    PLN(dst, interp, delta_x);
 146 }
 147
 148 void
 149 gen8_fs_generator::generate_tex(fs_inst *ir,
 150                                 struct brw_reg dst,
 151                                 struct brw_reg src)
 152 {
 153    int msg_type = -1;
 154    int rlen = 4;
 155    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 156
 157    assert(src.file == BRW_GENERAL_REGISTER_FILE);
 158
 159    if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
 160       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 161
 162    switch (ir->opcode) {
 163    case SHADER_OPCODE_TEX:
 164       if (ir->shadow_compare) {
 165          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 166       } else {
 167          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 168       }
 169       break;
 170    case FS_OPCODE_TXB:
 171       if (ir->shadow_compare) {
 172          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 173       } else {
 174          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 175       }
 176       break;
 177    case SHADER_OPCODE_TXL:
 178       if (ir->shadow_compare) {
 179          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 180       } else {
 181          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 182       }
 183       break;
 184    case SHADER_OPCODE_TXS:
 185       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 186       break;
 187    case SHADER_OPCODE_TXD:
 188       if (ir->shadow_compare) {
 189          msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
 190       } else {
 191          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 192       }
 193       break;
 194    case SHADER_OPCODE_TXF:
 195       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 196       break;
 197    case SHADER_OPCODE_TXF_CMS:
 198       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
 199       break;
 200    case SHADER_OPCODE_TXF_UMS:
 201       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
 202       break;
 203    case SHADER_OPCODE_TXF_MCS:
 204       msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
 205       break;
 206    case SHADER_OPCODE_LOD:
 207       msg_type = GEN5_SAMPLER_MESSAGE_LOD;
 208       break;
 209    case SHADER_OPCODE_TG4:
 210       if (ir->shadow_compare) {
 211          assert(brw->gen >= 7);
 212          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
 213       } else {
 214          assert(brw->gen >= 6);
 215          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
 216       }
 217       break;
 218    case SHADER_OPCODE_TG4_OFFSET:
 219       assert(brw->gen >= 7);
 220       if (ir->shadow_compare) {
 221          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
 222       } else {
 223          msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
 224       }
 225       break;
 226    default:
 227       assert(!"not reached");
 228       break;
 229    }
 230    assert(msg_type != -1);
 231
 232    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 233       rlen = 8;
 234       dst = vec16(dst);
 235    }
 236
 237    if (ir->header_present) {
 238       /* The send-from-GRF for SIMD16 texturing with a header has an extra
 239        * hardware register allocated to it, which we need to skip over (since
 240        * our coordinates in the payload are in the even-numbered registers,
 241        * and the header comes right before the first one.
 242        */
 243       if (dispatch_width == 16)
 244          src.nr++;
 245
 246       MOV_RAW(src, brw_vec8_grf(0, 0));
 247
 248       if (ir->texture_offset) {
 249          /* Set the texel offset bits. */
 250          MOV_RAW(retype(brw_vec1_grf(src.nr, 2), BRW_REGISTER_TYPE_UD),
 251                  brw_imm_ud(ir->texture_offset));
 252       }
 253    }
 254
 255    uint32_t surf_index =
 256       c->prog_data.base.binding_table.texture_start + ir->sampler;
 257
 258    gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
 259    gen8_set_dst(brw, inst, dst);
 260    gen8_set_src0(brw, inst, src);
 261    gen8_set_sampler_message(brw, inst,
 262                             surf_index,
 263                             ir->sampler,
 264                             msg_type,
 265                             rlen,
 266                             ir->mlen,
 267                             ir->header_present,
 268                             simd_mode);
 269
 270    mark_surface_used(surf_index);
 271 }
 272
 273
 274 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 275  * looking like:
 276  *
 277  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 278  *
 279  * and we're trying to produce:
 280  *
 281  *           DDX                     DDY
 282  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 283  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 284  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 285  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 286  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 287  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 288  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 289  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 290  *
 291  * and add another set of two more subspans if in 16-pixel dispatch mode.
 292  *
 293  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 294  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 295  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 296  * between each other.  We could probably do it like ddx and swizzle the right
 297  * order later, but bail for now and just produce
 298  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 299  */
 300 void
 301 gen8_fs_generator::generate_ddx(fs_inst *inst,
 302                                 struct brw_reg dst,
 303                                 struct brw_reg src)
 304 {
 305    unsigned vstride, width;
 306
 307    if (c->key.high_quality_derivatives) {
 308       /* Produce accurate derivatives. */
 309       vstride = BRW_VERTICAL_STRIDE_2;
 310       width = BRW_WIDTH_2;
 311    } else {
 312       /* Replicate the derivative at the top-left pixel to other pixels. */
 313       vstride = BRW_VERTICAL_STRIDE_4;
 314       width = BRW_WIDTH_4;
 315    }
 316
 317    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 318                                  BRW_REGISTER_TYPE_F,
 319                                  vstride,
 320                                  width,
 321                                  BRW_HORIZONTAL_STRIDE_0,
 322                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 323    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 324                                  BRW_REGISTER_TYPE_F,
 325                                  vstride,
 326                                  width,
 327                                  BRW_HORIZONTAL_STRIDE_0,
 328                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 329    ADD(dst, src0, negate(src1));
 330 }
 331
 332 /* The negate_value boolean is used to negate the derivative computation for
 333  * FBOs, since they place the origin at the upper left instead of the lower
 334  * left.
 335  */
 336 void
 337 gen8_fs_generator::generate_ddy(fs_inst *inst,
 338                                 struct brw_reg dst,
 339                                 struct brw_reg src,
 340                                 bool negate_value)
 341 {
 342    unsigned hstride;
 343    unsigned src0_swizzle;
 344    unsigned src1_swizzle;
 345    unsigned src1_subnr;
 346
 347    if (c->key.high_quality_derivatives) {
 348       /* Produce accurate derivatives. */
 349       hstride = BRW_HORIZONTAL_STRIDE_1;
 350       src0_swizzle = BRW_SWIZZLE_XYXY;
 351       src1_swizzle = BRW_SWIZZLE_ZWZW;
 352       src1_subnr = 0;
 353
 354       default_state.access_mode = BRW_ALIGN_16;
 355    } else {
 356       /* Replicate the derivative at the top-left pixel to other pixels. */
 357       hstride = BRW_HORIZONTAL_STRIDE_0;
 358       src0_swizzle = BRW_SWIZZLE_XYZW;
 359       src1_swizzle = BRW_SWIZZLE_XYZW;
 360       src1_subnr = 2;
 361    }
 362
 363    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 364                                  BRW_REGISTER_TYPE_F,
 365                                  BRW_VERTICAL_STRIDE_4,
 366                                  BRW_WIDTH_4,
 367                                  hstride,
 368                                  src0_swizzle, WRITEMASK_XYZW);
 369    struct brw_reg src1 = brw_reg(src.file, src.nr, src1_subnr,
 370                                  BRW_REGISTER_TYPE_F,
 371                                  BRW_VERTICAL_STRIDE_4,
 372                                  BRW_WIDTH_4,
 373                                  hstride,
 374                                  src1_swizzle, WRITEMASK_XYZW);
 375
 376    if (negate_value)
 377       ADD(dst, src1, negate(src0));
 378    else
 379       ADD(dst, src0, negate(src1));
 380
 381    default_state.access_mode = BRW_ALIGN_1;
 382 }
 383
 384 void
 385 gen8_fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg dst)
 386 {
 387    assert(inst->mlen != 0);
 388    assert(!"TODO: Implement generate_scratch_write.");
 389 }
 390
 391 void
 392 gen8_fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
 393 {
 394    assert(inst->mlen != 0);
 395    assert(!"TODO: Implement generate_scratch_read.");
 396 }
 397
 398 void
 399 gen8_fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
 400 {
 401    assert(inst->mlen != 0);
 402    assert(!"TODO: Implement generate_scratch_read_gen7.");
 403 }
 404
 405 void
 406 gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 407                                                        struct brw_reg dst,
 408                                                        struct brw_reg index,
 409                                                        struct brw_reg offset)
 410 {
 411    assert(inst->mlen == 0);
 412
 413    assert(index.file == BRW_IMMEDIATE_VALUE &&
 414           index.type == BRW_REGISTER_TYPE_UD);
 415    uint32_t surf_index = index.dw1.ud;
 416
 417    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
 418    /* Reference only the dword we need lest we anger validate_reg() with
 419     * reg.width > reg.execszie.
 420     */
 421    offset = brw_vec1_grf(offset.nr, 0);
 422
 423    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 424    gen8_set_mask_control(send, BRW_MASK_DISABLE);
 425
 426    /* We use the SIMD4x2 mode because we want to end up with 4 constants in
 427     * the destination loaded consecutively from the same offset (which appears
 428     * in the first component, and the rest are ignored).
 429     */
 430    dst.width = BRW_WIDTH_4;
 431    gen8_set_dst(brw, send, dst);
 432    gen8_set_src0(brw, send, offset);
 433    gen8_set_sampler_message(brw, send,
 434                             surf_index,
 435                             0, /* The LD message ignores the sampler unit. */
 436                             GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 437                             1, /* rlen */
 438                             1, /* mlen */
 439                             false, /* no header */
 440                             BRW_SAMPLER_SIMD_MODE_SIMD4X2);
 441
 442    mark_surface_used(surf_index);
 443 }
 444
 445 void
 446 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst *ir,
 447                                                        struct brw_reg dst,
 448                                                        struct brw_reg index,
 449                                                        struct brw_reg offset)
 450 {
 451    /* Varying-offset pull constant loads are treated as a normal expression on
 452     * gen7, so the fact that it's a send message is hidden at the IR level.
 453     */
 454    assert(!ir->header_present);
 455    assert(!ir->mlen);
 456
 457    assert(index.file == BRW_IMMEDIATE_VALUE &&
 458           index.type == BRW_REGISTER_TYPE_UD);
 459    uint32_t surf_index = index.dw1.ud;
 460
 461    uint32_t simd_mode, rlen, mlen;
 462    if (dispatch_width == 16) {
 463       mlen = 2;
 464       rlen = 8;
 465       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 466    } else {
 467       mlen = 1;
 468       rlen = 4;
 469       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 470    }
 471
 472    gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
 473    gen8_set_dst(brw, send, dst);
 474    gen8_set_src0(brw, send, offset);
 475    gen8_set_sampler_message(brw, send,
 476                             surf_index,
 477                             0, /* The LD message ignore the sampler unit. */
 478                             GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
 479                             rlen, /* rlen */
 480                             mlen, /* mlen */
 481                             false, /* no header */
 482                             simd_mode);
 483
 484    mark_surface_used(surf_index);
 485 }
 486
 487 /**
 488  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 489  * into the flags register (f0.0).
 490  */
 491 void
 492 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst *ir)
 493 {
 494    struct brw_reg flags = brw_flag_reg(0, ir->flag_subreg);
 495    struct brw_reg dispatch_mask =
 496       retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 497
 498    gen8_instruction *mov = MOV(flags, dispatch_mask);
 499    gen8_set_mask_control(mov, BRW_MASK_DISABLE);
 500 }
 501
 502 void
 503 gen8_fs_generator::generate_discard_jump(fs_inst *ir)
 504 {
 505    /* This HALT will be patched up at FB write time to point UIP at the end of
 506     * the program, and at brw_uip_jip() JIP will be set to the end of the
 507     * current block (or the program).
 508     */
 509    discard_halt_patches.push_tail(new(mem_ctx) ip_record(nr_inst));
 510
 511    HALT();
 512 }
 513
 514 void
 515 gen8_fs_generator::patch_discard_jumps_to_fb_writes()
 516 {
 517    if (discard_halt_patches.is_empty())
 518       return;
 519
 520    /* There is a somewhat strange undocumented requirement of using
 521     * HALT, according to the simulator.  If some channel has HALTed to
 522     * a particular UIP, then by the end of the program, every channel
 523     * must have HALTed to that UIP.  Furthermore, the tracking is a
 524     * stack, so you can't do the final halt of a UIP after starting
 525     * halting to a new UIP.
 526     *
 527     * Symptoms of not emitting this instruction on actual hardware
 528     * included GPU hangs and sparkly rendering on the piglit discard
 529     * tests.
 530     */
 531    gen8_instruction *last_halt = HALT();
 532    gen8_set_uip(last_halt, 16);
 533    gen8_set_jip(last_halt, 16);
 534
 535    int ip = nr_inst;
 536
 537    foreach_list(node, &discard_halt_patches) {
 538       ip_record *patch_ip = (ip_record *) node;
 539       gen8_instruction *patch = &store[patch_ip->ip];
 540       assert(gen8_opcode(patch) == BRW_OPCODE_HALT);
 541
 542       /* HALT takes an instruction distance from the pre-incremented IP. */
 543       gen8_set_uip(patch, (ip - patch_ip->ip) * 16);
 544    }
 545
 546    this->discard_halt_patches.make_empty();
 547 }
 548
 549 /**
 550  * Sets the first dword of a vgrf for simd4x2 uniform pull constant
 551  * sampler LD messages.
 552  *
 553  * We don't want to bake it into the send message's code generation because
 554  * that means we don't get a chance to schedule the instruction.
 555  */
 556 void
 557 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst *ir,
 558                                                struct brw_reg dst,
 559                                                struct brw_reg value)
 560 {
 561    assert(value.file == BRW_IMMEDIATE_VALUE);
 562    MOV_RAW(retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
 563 }
 564
 565 void
 566 gen8_fs_generator::generate_code(exec_list *instructions)
 567 {
 568    int last_native_inst_offset = next_inst_offset;
 569    const char *last_annotation_string = NULL;
 570    const void *last_annotation_ir = NULL;
 571
 572    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 573       if (shader) {
 574          printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
 575                 shader_prog->Name, dispatch_width);
 576       } else if (fp) {
 577          printf("Native code for fragment program %d (SIMD%d dispatch):\n",
 578                 prog->Id, dispatch_width);
 579       } else {
 580          printf("Native code for blorp program (SIMD%d dispatch):\n",
 581                 dispatch_width);
 582       }
 583    }
 584
 585    cfg_t *cfg = NULL;
 586    if (unlikely(INTEL_DEBUG & DEBUG_WM))
 587       cfg = new(mem_ctx) cfg_t(instructions);
 588
 589    foreach_list(node, instructions) {
 590       fs_inst *ir = (fs_inst *) node;
 591       struct brw_reg src[3], dst;
 592
 593       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 594          foreach_list(node, &cfg->block_list) {
 595             bblock_link *link = (bblock_link *)node;
 596             bblock_t *block = link->block;
 597
 598             if (block->start == ir) {
 599                printf("   START B%d", block->block_num);
 600                foreach_list(predecessor_node, &block->parents) {
 601                   bblock_link *predecessor_link =
 602                      (bblock_link *)predecessor_node;
 603                   bblock_t *predecessor_block = predecessor_link->block;
 604                   printf(" <-B%d", predecessor_block->block_num);
 605                }
 606                printf("\n");
 607             }
 608          }
 609
 610          if (last_annotation_ir != ir->ir) {
 611             last_annotation_ir = ir->ir;
 612             if (last_annotation_ir) {
 613                printf("   ");
 614                if (shader) {
 615                   ((ir_instruction *) ir->ir)->print();
 616                } else if (prog) {
 617                   const prog_instruction *fpi;
 618                   fpi = (const prog_instruction *) ir->ir;
 619                   printf("%d: ", (int)(fpi - prog->Instructions));
 620                   _mesa_fprint_instruction_opt(stdout,
 621                                                fpi,
 622                                                0, PROG_PRINT_DEBUG, NULL);
 623                }
 624                printf("\n");
 625             }
 626          }
 627          if (last_annotation_string != ir->annotation) {
 628             last_annotation_string = ir->annotation;
 629             if (last_annotation_string)
 630                printf("   %s\n", last_annotation_string);
 631          }
 632       }
 633
 634       for (unsigned int i = 0; i < 3; i++) {
 635          src[i] = brw_reg_from_fs_reg(&ir->src[i]);
 636
 637          /* The accumulator result appears to get used for the
 638           * conditional modifier generation.  When negating a UD
 639           * value, there is a 33rd bit generated for the sign in the
 640           * accumulator value, so now you can't check, for example,
 641           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
 642           */
 643          assert(!ir->conditional_mod ||
 644                 ir->src[i].type != BRW_REGISTER_TYPE_UD ||
 645                 !ir->src[i].negate);
 646       }
 647       dst = brw_reg_from_fs_reg(&ir->dst);
 648
 649       default_state.conditional_mod = ir->conditional_mod;
 650       default_state.predicate = ir->predicate;
 651       default_state.predicate_inverse = ir->predicate_inverse;
 652       default_state.saturate = ir->saturate;
 653       default_state.flag_subreg_nr = ir->flag_subreg;
 654
 655       if (dispatch_width == 16 && !ir->force_uncompressed)
 656          default_state.exec_size = BRW_EXECUTE_16;
 657       else
 658          default_state.exec_size = BRW_EXECUTE_8;
 659
 660       /* fs_inst::force_sechalf is only used for original Gen4 code, so we
 661        * don't handle it.  Add qtr_control to default_state if that changes.
 662        */
 663       assert(!ir->force_sechalf);
 664
 665       switch (ir->opcode) {
 666       case BRW_OPCODE_MOV:
 667          MOV(dst, src[0]);
 668          break;
 669       case BRW_OPCODE_ADD:
 670          ADD(dst, src[0], src[1]);
 671          break;
 672       case BRW_OPCODE_MUL:
 673          MUL(dst, src[0], src[1]);
 674          break;
 675       case BRW_OPCODE_MACH:
 676          MACH(dst, src[0], src[1]);
 677          break;
 678
 679       case BRW_OPCODE_MAD:
 680          default_state.access_mode = BRW_ALIGN_16;
 681          MAD(dst, src[0], src[1], src[2]);
 682          default_state.access_mode = BRW_ALIGN_1;
 683          break;
 684
 685       case BRW_OPCODE_LRP:
 686          default_state.access_mode = BRW_ALIGN_16;
 687          LRP(dst, src[0], src[1], src[2]);
 688          default_state.access_mode = BRW_ALIGN_1;
 689          break;
 690
 691
 692       case BRW_OPCODE_FRC:
 693          FRC(dst, src[0]);
 694          break;
 695       case BRW_OPCODE_RNDD:
 696          RNDD(dst, src[0]);
 697          break;
 698       case BRW_OPCODE_RNDE:
 699          RNDE(dst, src[0]);
 700          break;
 701       case BRW_OPCODE_RNDZ:
 702          RNDZ(dst, src[0]);
 703          break;
 704
 705       case BRW_OPCODE_AND:
 706          AND(dst, src[0], src[1]);
 707          break;
 708       case BRW_OPCODE_OR:
 709          OR(dst, src[0], src[1]);
 710          break;
 711       case BRW_OPCODE_XOR:
 712          XOR(dst, src[0], src[1]);
 713          break;
 714       case BRW_OPCODE_NOT:
 715          NOT(dst, src[0]);
 716          break;
 717       case BRW_OPCODE_ASR:
 718          ASR(dst, src[0], src[1]);
 719          break;
 720       case BRW_OPCODE_SHR:
 721          SHR(dst, src[0], src[1]);
 722          break;
 723       case BRW_OPCODE_SHL:
 724          SHL(dst, src[0], src[1]);
 725          break;
 726
 727       case BRW_OPCODE_F32TO16:
 728          F32TO16(dst, src[0]);
 729          break;
 730       case BRW_OPCODE_F16TO32:
 731          F16TO32(dst, src[0]);
 732          break;
 733
 734       case BRW_OPCODE_CMP:
 735          CMP(dst, ir->conditional_mod, src[0], src[1]);
 736          break;
 737       case BRW_OPCODE_SEL:
 738          SEL(dst, src[0], src[1]);
 739          break;
 740
 741       case BRW_OPCODE_BFREV:
 742          /* BFREV only supports UD type for src and dst. */
 743          BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
 744                retype(src[0], BRW_REGISTER_TYPE_UD));
 745          break;
 746
 747       case BRW_OPCODE_FBH:
 748          /* FBH only supports UD type for dst. */
 749          FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
 750          break;
 751
 752       case BRW_OPCODE_FBL:
 753          /* FBL only supports UD type for dst. */
 754          FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
 755          break;
 756
 757       case BRW_OPCODE_CBIT:
 758          /* CBIT only supports UD type for dst. */
 759          CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
 760          break;
 761
 762       case BRW_OPCODE_ADDC:
 763          ADDC(dst, src[0], src[1]);
 764          break;
 765
 766       case BRW_OPCODE_SUBB:
 767          SUBB(dst, src[0], src[1]);
 768          break;
 769
 770       case BRW_OPCODE_BFE:
 771          default_state.access_mode = BRW_ALIGN_16;
 772          BFE(dst, src[0], src[1], src[2]);
 773          default_state.access_mode = BRW_ALIGN_1;
 774          break;
 775
 776       case BRW_OPCODE_BFI1:
 777          BFI1(dst, src[0], src[1]);
 778          break;
 779
 780       case BRW_OPCODE_BFI2:
 781          default_state.access_mode = BRW_ALIGN_16;
 782          BFI2(dst, src[0], src[1], src[2]);
 783          default_state.access_mode = BRW_ALIGN_1;
 784          break;
 785
 786       case BRW_OPCODE_IF:
 787          IF(BRW_PREDICATE_NORMAL);
 788          break;
 789
 790       case BRW_OPCODE_ELSE:
 791          ELSE();
 792          break;
 793
 794       case BRW_OPCODE_ENDIF:
 795          ENDIF();
 796          break;
 797
 798       case BRW_OPCODE_DO:
 799          DO();
 800          break;
 801
 802       case BRW_OPCODE_BREAK:
 803          BREAK();
 804          break;
 805
 806       case BRW_OPCODE_CONTINUE:
 807          CONTINUE();
 808          break;
 809
 810       case BRW_OPCODE_WHILE:
 811          WHILE();
 812          break;
 813
 814       case SHADER_OPCODE_RCP:
 815          MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
 816          break;
 817
 818       case SHADER_OPCODE_RSQ:
 819          MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
 820          break;
 821
 822       case SHADER_OPCODE_SQRT:
 823          MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
 824          break;
 825
 826       case SHADER_OPCODE_EXP2:
 827          MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
 828          break;
 829
 830       case SHADER_OPCODE_LOG2:
 831          MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
 832          break;
 833
 834       case SHADER_OPCODE_SIN:
 835          MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
 836          break;
 837
 838       case SHADER_OPCODE_COS:
 839          MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
 840          break;
 841
 842       case SHADER_OPCODE_INT_QUOTIENT:
 843          MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
 844          break;
 845
 846       case SHADER_OPCODE_INT_REMAINDER:
 847          MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
 848          break;
 849
 850       case SHADER_OPCODE_POW:
 851          MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
 852          break;
 853
 854       case FS_OPCODE_PIXEL_X:
 855       case FS_OPCODE_PIXEL_Y:
 856          assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
 857          break;
 858
 859       case FS_OPCODE_CINTERP:
 860          MOV(dst, src[0]);
 861          break;
 862       case FS_OPCODE_LINTERP:
 863          generate_linterp(ir, dst, src);
 864          break;
 865       case SHADER_OPCODE_TEX:
 866       case FS_OPCODE_TXB:
 867       case SHADER_OPCODE_TXD:
 868       case SHADER_OPCODE_TXF:
 869       case SHADER_OPCODE_TXF_CMS:
 870       case SHADER_OPCODE_TXF_UMS:
 871       case SHADER_OPCODE_TXF_MCS:
 872       case SHADER_OPCODE_TXL:
 873       case SHADER_OPCODE_TXS:
 874       case SHADER_OPCODE_LOD:
 875       case SHADER_OPCODE_TG4:
 876       case SHADER_OPCODE_TG4_OFFSET:
 877          generate_tex(ir, dst, src[0]);
 878          break;
 879
 880       case FS_OPCODE_DDX:
 881          generate_ddx(ir, dst, src[0]);
 882          break;
 883       case FS_OPCODE_DDY:
 884          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
 885           * guarantee that c->key.render_to_fbo is set).
 886           */
 887          assert(fp->UsesDFdy);
 888          generate_ddy(ir, dst, src[0], c->key.render_to_fbo);
 889          break;
 890
 891       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 892          generate_scratch_write(ir, src[0]);
 893          break;
 894
 895       case SHADER_OPCODE_GEN4_SCRATCH_READ:
 896          generate_scratch_read(ir, dst);
 897          break;
 898
 899       case SHADER_OPCODE_GEN7_SCRATCH_READ:
 900          generate_scratch_read_gen7(ir, dst);
 901          break;
 902
 903       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
 904          generate_uniform_pull_constant_load(ir, dst, src[0], src[1]);
 905          break;
 906
 907       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 908          generate_varying_pull_constant_load(ir, dst, src[0], src[1]);
 909          break;
 910
 911       case FS_OPCODE_FB_WRITE:
 912          generate_fb_write(ir);
 913          break;
 914
 915       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
 916          generate_mov_dispatch_to_flags(ir);
 917          break;
 918
 919       case FS_OPCODE_DISCARD_JUMP:
 920          generate_discard_jump(ir);
 921          break;
 922
 923       case SHADER_OPCODE_SHADER_TIME_ADD:
 924          assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
 925          break;
 926
 927       case SHADER_OPCODE_UNTYPED_ATOMIC:
 928          assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
 929          break;
 930
 931       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 932          assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
 933          break;
 934
 935       case FS_OPCODE_SET_SIMD4X2_OFFSET:
 936          generate_set_simd4x2_offset(ir, dst, src[0]);
 937          break;
 938
 939       case FS_OPCODE_SET_OMASK:
 940          assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
 941          break;
 942
 943       case FS_OPCODE_SET_SAMPLE_ID:
 944          assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
 945          break;
 946
 947       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
 948          assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
 949          break;
 950
 951       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
 952       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
 953          assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
 954          break;
 955
 956       case FS_OPCODE_PLACEHOLDER_HALT:
 957          /* This is the place where the final HALT needs to be inserted if
 958           * we've emitted any discards.  If not, this will emit no code.
 959           */
 960          patch_discard_jumps_to_fb_writes();
 961          break;
 962
 963       default:
 964          if (ir->opcode < int(ARRAY_SIZE(opcode_descs))) {
 965             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
 966                           opcode_descs[ir->opcode].name);
 967          } else {
 968             _mesa_problem(ctx, "Unsupported opcode %d in FS", ir->opcode);
 969          }
 970          abort();
 971       }
 972
 973       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 974          disassemble(stdout, last_native_inst_offset, next_inst_offset);
 975
 976          foreach_list(node, &cfg->block_list) {
 977             bblock_link *link = (bblock_link *)node;
 978             bblock_t *block = link->block;
 979
 980             if (block->end == ir) {
 981                printf("   END B%d", block->block_num);
 982                foreach_list(successor_node, &block->children) {
 983                   bblock_link *successor_link =
 984                      (bblock_link *)successor_node;
 985                   bblock_t *successor_block = successor_link->block;
 986                   printf(" ->B%d", successor_block->block_num);
 987                }
 988                printf("\n");
 989             }
 990          }
 991       }
 992
 993       last_native_inst_offset = next_inst_offset;
 994    }
 995
 996    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 997       printf("\n");
 998    }
 999
1000    patch_jump_targets();
1001 }
1002
1003 const unsigned *
1004 gen8_fs_generator::generate_assembly(exec_list *simd8_instructions,
1005                                      exec_list *simd16_instructions,
1006                                      unsigned *assembly_size)
1007 {
1008    assert(simd8_instructions || simd16_instructions);
1009
1010    if (simd8_instructions) {
1011       dispatch_width = 8;
1012       generate_code(simd8_instructions);
1013    }
1014
1015    if (simd16_instructions) {
1016       /* Align to a 64-byte boundary. */
1017       while ((nr_inst * sizeof(gen8_instruction)) % 64)
1018          NOP();
1019
1020       /* Save off the start of this SIMD16 program */
1021       c->prog_data.prog_offset_16 = nr_inst * sizeof(gen8_instruction);
1022
1023       dispatch_width = 16;
1024       generate_code(simd16_instructions);
1025    }
1026
1027    *assembly_size = next_inst_offset;
1028    return (const unsigned *) store;
1029 }