src/mesa/drivers/dri/i965/brw_fs_generator.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_generator.cpp
  25  *
  26  * This file supports generating code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 #include "main/macros.h"
  31 #include "brw_context.h"
  32 #include "brw_eu.h"
  33 #include "brw_fs.h"
  34 #include "brw_cfg.h"
  35
  36 static uint32_t brw_file_from_reg(fs_reg *reg)
  37 {
  38    switch (reg->file) {
  39    case GRF:
  40       return BRW_GENERAL_REGISTER_FILE;
  41    case MRF:
  42       return BRW_MESSAGE_REGISTER_FILE;
  43    case IMM:
  44       return BRW_IMMEDIATE_VALUE;
  45    default:
  46       unreachable("not reached");
  47    }
  48 }
  49
  50 static struct brw_reg
  51 brw_reg_from_fs_reg(fs_reg *reg)
  52 {
  53    struct brw_reg brw_reg;
  54
  55    switch (reg->file) {
  56    case GRF:
  57    case MRF:
  58       if (reg->stride == 0) {
  59          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
  60       } else if (reg->width < 8) {
  61          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
  62          brw_reg = stride(brw_reg, reg->width * reg->stride,
  63                           reg->width, reg->stride);
  64       } else {
  65          /* From the Haswell PRM:
  66           *
  67           * VertStride must be used to cross GRF register boundaries. This
  68           * rule implies that elements within a 'Width' cannot cross GRF
  69           * boundaries.
  70           *
  71           * So, for registers with width > 8, we have to use a width of 8
  72           * and trust the compression state to sort out the exec size.
  73           */
  74          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
  75          brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
  76       }
  77
  78       brw_reg = retype(brw_reg, reg->type);
  79       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
  80       break;
  81    case IMM:
  82       switch (reg->type) {
  83       case BRW_REGISTER_TYPE_F:
  84          brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
  85          break;
  86       case BRW_REGISTER_TYPE_D:
  87          brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
  88          break;
  89       case BRW_REGISTER_TYPE_UD:
  90          brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
  91          break;
  92       case BRW_REGISTER_TYPE_W:
  93          brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
  94          break;
  95       case BRW_REGISTER_TYPE_UW:
  96          brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
  97          break;
  98       case BRW_REGISTER_TYPE_VF:
  99          brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
 100          break;
 101       default:
 102          unreachable("not reached");
 103       }
 104       break;
 105    case HW_REG:
 106       assert(reg->type == reg->fixed_hw_reg.type);
 107       brw_reg = reg->fixed_hw_reg;
 108       break;
 109    case BAD_FILE:
 110       /* Probably unused. */
 111       brw_reg = brw_null_reg();
 112       break;
 113    default:
 114       unreachable("not reached");
 115    }
 116    if (reg->abs)
 117       brw_reg = brw_abs(brw_reg);
 118    if (reg->negate)
 119       brw_reg = negate(brw_reg);
 120
 121    return brw_reg;
 122 }
 123
 124 fs_generator::fs_generator(struct brw_context *brw,
 125                            void *mem_ctx,
 126                            const void *key,
 127                            struct brw_stage_prog_data *prog_data,
 128                            struct gl_program *prog,
 129                            bool runtime_check_aads_emit,
 130                            const char *stage_abbrev)
 131
 132    : brw(brw), key(key),
 133      prog_data(prog_data),
 134      prog(prog), runtime_check_aads_emit(runtime_check_aads_emit),
 135      debug_flag(false), stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
 136 {
 137    ctx = &brw->ctx;
 138
 139    p = rzalloc(mem_ctx, struct brw_compile);
 140    brw_init_compile(brw, p, mem_ctx);
 141 }
 142
 143 fs_generator::~fs_generator()
 144 {
 145 }
 146
 147 class ip_record : public exec_node {
 148 public:
 149    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
 150
 151    ip_record(int ip)
 152    {
 153       this->ip = ip;
 154    }
 155
 156    int ip;
 157 };
 158
 159 bool
 160 fs_generator::patch_discard_jumps_to_fb_writes()
 161 {
 162    if (brw->gen < 6 || this->discard_halt_patches.is_empty())
 163       return false;
 164
 165    int scale = brw_jump_scale(brw);
 166
 167    /* There is a somewhat strange undocumented requirement of using
 168     * HALT, according to the simulator.  If some channel has HALTed to
 169     * a particular UIP, then by the end of the program, every channel
 170     * must have HALTed to that UIP.  Furthermore, the tracking is a
 171     * stack, so you can't do the final halt of a UIP after starting
 172     * halting to a new UIP.
 173     *
 174     * Symptoms of not emitting this instruction on actual hardware
 175     * included GPU hangs and sparkly rendering on the piglit discard
 176     * tests.
 177     */
 178    brw_inst *last_halt = gen6_HALT(p);
 179    brw_inst_set_uip(brw, last_halt, 1 * scale);
 180    brw_inst_set_jip(brw, last_halt, 1 * scale);
 181
 182    int ip = p->nr_insn;
 183
 184    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
 185       brw_inst *patch = &p->store[patch_ip->ip];
 186
 187       assert(brw_inst_opcode(brw, patch) == BRW_OPCODE_HALT);
 188       /* HALT takes a half-instruction distance from the pre-incremented IP. */
 189       brw_inst_set_uip(brw, patch, (ip - patch_ip->ip) * scale);
 190    }
 191
 192    this->discard_halt_patches.make_empty();
 193    return true;
 194 }
 195
 196 void
 197 fs_generator::fire_fb_write(fs_inst *inst,
 198                             struct brw_reg payload,
 199                             struct brw_reg implied_header,
 200                             GLuint nr)
 201 {
 202    uint32_t msg_control;
 203
 204    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
 205
 206    if (brw->gen < 6) {
 207       brw_push_insn_state(p);
 208       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 209       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 210       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 211       brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
 212       brw_pop_insn_state(p);
 213    }
 214
 215    if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
 216       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
 217    else if (prog_data->dual_src_blend)
 218       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
 219    else if (dispatch_width == 16)
 220       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
 221    else
 222       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 223
 224    uint32_t surf_index =
 225       prog_data->binding_table.render_target_start + inst->target;
 226
 227    brw_fb_WRITE(p,
 228                 dispatch_width,
 229                 payload,
 230                 implied_header,
 231                 msg_control,
 232                 surf_index,
 233                 nr,
 234                 0,
 235                 inst->eot,
 236                 inst->header_present);
 237
 238    brw_mark_surface_used(&prog_data->base, surf_index);
 239 }
 240
/**
 * Generate the code for a framebuffer write instruction.
 *
 * Builds the message header when the instruction has one, then emits the
 * render-target write through fire_fb_write().  When
 * runtime_check_aads_emit is set (gen < 6 only) two writes are emitted
 * around a predicated jump: one with the first payload register dropped
 * and one with the full payload.
 *
 * \param inst     FB write instruction being generated
 * \param payload  message payload; replaced by the MRF at inst->base_mrf
 *                 when base_mrf is non-negative
 */
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   /* Everything except Haswell and gen8+ drops the default predicate here
    * (see the SENDC note further down).
    */
   if (brw->gen < 8 && !brw->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   /* An MRF-based message overrides the payload register passed in. */
   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   if (inst->header_present) {
      /* The header setup runs unmasked, unpredicated and uncompressed,
       * using flag register f0.0; the previous state is restored by the
       * pop below.
       */
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         /* The pixel mask lives in g1.7 on gen6+ and in g0.0 before that;
          * it is overwritten with the contents of flag register f0.1.
          */
         if (brw->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (brw->gen >= 6) {
         /* Gen6+: copy the header explicitly, starting at g0, with a
          * compressed MOV; no implied header is used.
          */
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         /* Pre-gen6: g0 is passed to the write as the implied header. */
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(brw->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      /* AND bit 26 of g1.6 into the null register purely for its NZ
       * conditional-modifier result.
       */
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_NZ);

      /* Predicated scalar jump whose distance is filled in by
       * brw_land_fwd_jump() below, so that it lands right after the first
       * write; jmp is the index of the JMPI in the instruction store.
       */
      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      /* Full-length write, starting at the first payload register. */
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}
 339
 340 void
 341 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
 342 {
 343    brw_inst *insn;
 344
 345    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 346
 347    brw_set_dest(p, insn, brw_null_reg());
 348    brw_set_src0(p, insn, payload);
 349    brw_set_src1(p, insn, brw_imm_d(0));
 350
 351    brw_inst_set_sfid(brw, insn, BRW_SFID_URB);
 352    brw_inst_set_urb_opcode(brw, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
 353
 354    brw_inst_set_mlen(brw, insn, inst->mlen);
 355    brw_inst_set_rlen(brw, insn, 0);
 356    brw_inst_set_eot(brw, insn, inst->eot);
 357    brw_inst_set_header_present(brw, insn, true);
 358    brw_inst_set_urb_global_offset(brw, insn, inst->offset);
 359 }
 360
 361 void
 362 fs_generator::generate_blorp_fb_write(fs_inst *inst)
 363 {
 364    brw_fb_WRITE(p,
 365                 16 /* dispatch_width */,
 366                 brw_message_reg(inst->base_mrf),
 367                 brw_reg_from_fs_reg(&inst->src[0]),
 368                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
 369                 inst->target,
 370                 inst->mlen,
 371                 0,
 372                 true,
 373                 inst->header_present);
 374 }
 375
 376 /* Computes the integer pixel x,y values from the origin.
 377  *
 378  * This is the basis of gl_FragCoord computation, but is also used
 379  * pre-gen6 for computing the deltas from v0 for computing
 380  * interpolation.
 381  */
 382 void
 383 fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
 384 {
 385    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 386    struct brw_reg src;
 387    struct brw_reg deltas;
 388
 389    if (is_x) {
 390       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
 391       deltas = brw_imm_v(0x10101010);
 392    } else {
 393       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
 394       deltas = brw_imm_v(0x11001100);
 395    }
 396
 397    if (dispatch_width == 16) {
 398       dst = vec16(dst);
 399    }
 400
 401    /* We do this SIMD8 or SIMD16, but since the destination is UW we
 402     * don't do compression in the SIMD16 case.
 403     */
 404    brw_push_insn_state(p);
 405    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 406    brw_ADD(p, dst, src, deltas);
 407    brw_pop_insn_state(p);
 408 }
 409
 410 void
 411 fs_generator::generate_linterp(fs_inst *inst,
 412                              struct brw_reg dst, struct brw_reg *src)
 413 {
 414    struct brw_reg delta_x = src[0];
 415    struct brw_reg delta_y = src[1];
 416    struct brw_reg interp = src[2];
 417
 418    if (brw->has_pln &&
 419        delta_y.nr == delta_x.nr + 1 &&
 420        (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
 421       brw_PLN(p, dst, interp, delta_x);
 422    } else {
 423       brw_LINE(p, brw_null_reg(), interp, delta_x);
 424       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 425    }
 426 }
 427
 428 void
 429 fs_generator::generate_math_gen6(fs_inst *inst,
 430                                  struct brw_reg dst,
 431                                  struct brw_reg src0,
 432                                  struct brw_reg src1)
 433 {
 434    int op = brw_math_function(inst->opcode);
 435    bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;
 436
 437    if (dispatch_width == 8) {
 438       gen6_math(p, dst, op, src0, src1);
 439    } else if (dispatch_width == 16) {
 440       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 441       gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1));
 442       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
 443       gen6_math(p, sechalf(dst), op, sechalf(src0),
 444                 binop ? sechalf(src1) : brw_null_reg());
 445       brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 446    }
 447 }
 448
 449 void
 450 fs_generator::generate_math_gen4(fs_inst *inst,
 451                                struct brw_reg dst,
 452                                struct brw_reg src)
 453 {
 454    int op = brw_math_function(inst->opcode);
 455
 456    assert(inst->mlen >= 1);
 457
 458    if (dispatch_width == 8) {
 459       gen4_math(p, dst,
 460                 op,
 461                 inst->base_mrf, src,
 462                 BRW_MATH_PRECISION_FULL);
 463    } else if (dispatch_width == 16) {
 464       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 465       gen4_math(p, firsthalf(dst),
 466                 op,
 467                 inst->base_mrf, firsthalf(src),
 468                 BRW_MATH_PRECISION_FULL);
 469       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
 470       gen4_math(p, sechalf(dst),
 471                 op,
 472                 inst->base_mrf + 1, sechalf(src),
 473                 BRW_MATH_PRECISION_FULL);
 474
 475       brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 476    }
 477 }
 478
 479 void
 480 fs_generator::generate_math_g45(fs_inst *inst,
 481                                 struct brw_reg dst,
 482                                 struct brw_reg src)
 483 {
 484    if (inst->opcode == SHADER_OPCODE_POW ||
 485        inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
 486        inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
 487       generate_math_gen4(inst, dst, src);
 488       return;
 489    }
 490
 491    int op = brw_math_function(inst->opcode);
 492
 493    assert(inst->mlen >= 1);
 494
 495    gen4_math(p, dst,
 496              op,
 497              inst->base_mrf, src,
 498              BRW_MATH_PRECISION_FULL);
 499 }
 500
/**
 * Generate the sampler message for a texturing instruction.
 *
 * Chooses the return format from the destination type, the SIMD mode from
 * the instruction's execution size and the message type from the opcode
 * and hardware generation, sets up the message header when one is present,
 * and finally emits the message: through brw_SAMPLE() when the sampler
 * index is an immediate, or through an indirect SEND whose descriptor is
 * built in a0.0 when it is not.
 *
 * \param inst           texturing instruction being generated
 * \param dst            destination of the sampler response
 * \param src            message source register
 * \param sampler_index  sampler index (must be of type UD); either an
 *                       immediate or a register
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode;
   uint32_t return_format;

   /* The sampler return format follows the destination type; anything that
    * is not D or UD is returned as float.
    */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Initial SIMD mode from the execution size; some pre-gen5 messages
    * below override it to SIMD16.
    */
   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (brw->gen >= 5) {
      /* Gen5+: the message type depends only on the opcode and on whether
       * a shadow comparison is requested.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->gen >= 8 || brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         /* Before gen7 this falls back to the plain LD message. */
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(brw->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(brw->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(brw->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      default:
         unreachable("not reached");
      }
   } else {
      /* Pre-gen5: the message type also fixes the SIMD mode, and the
       * expected message length is asserted for each case.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         assert(dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
         } else {
            assert(inst->mlen <= 4);
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   /* A SIMD16 message doubles the response length (8 instead of the
    * default 4) and needs a 16-wide destination.
    */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   /* On gen7+ a message with a header must come from the GRF. */
   assert(brw->gen < 7 || !inst->header_present ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_present) {
      if (brw->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         /* On gen7+ the header is written into the message source itself;
          * earlier generations write it into the MRF at inst->base_mrf.
          */
         if (brw->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   /* Gather (TG4) opcodes use a separate binding table section from all
    * other texture opcodes.
    */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant sampler index: the surface and sampler are known here, so
       * the message is emitted directly and the surface marked as used.
       */
      uint32_t sampler = sampler_index.dw1.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 sampler + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 rlen,
                 inst->mlen,
                 inst->header_present,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
   } else {
      /* Non-const sampler index */
      /* Note: this clobbers `dst` as a temporary before emitting the send */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));

      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Some care required: `sampler` and `temp` may alias, so `sampler`
       * is read for the last time by the SHL that first writes `temp`:
       *    addr = sampler + base_binding_table_index
       *    temp = (sampler << 8) & 0xf00
       *    addr = (addr & 0xff) | temp
       */
      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
      brw_OR(p, addr, addr, temp);

      /* a0.0 |= <descriptor> */
      /* The OR is emitted raw and brw_set_sampler_message() fills in its
       * descriptor fields with surface and sampler left at 0; the OR then
       * merges them with the indices already held in a0.0.
       */
      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
      brw_set_sampler_message(p, insn_or,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              rlen,
                              inst->mlen /* mlen */,
                              inst->header_present /* header */,
                              simd_mode,
                              return_format);
      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
      brw_set_src0(p, insn_or, addr);
      brw_set_dest(p, insn_or, addr);


      /* dst = send(offset, a0.0) */
      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn_send, dst);
      brw_set_src0(p, insn_send, src);
      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
 784
 785
 786 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 787  * looking like:
 788  *
 789  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 790  *
 791  * Ideally, we want to produce:
 792  *
 793  *           DDX                     DDY
 794  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 795  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 796  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 797  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 798  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 799  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 800  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 801  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 802  *
 803  * and add another set of two more subspans if in 16-pixel dispatch mode.
 804  *
 805  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 806  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 807  * pair.  But the ideal approximation may impose a huge performance cost on
 808  * sample_d.  On at least Haswell, sample_d instruction does some
 809  * optimizations if the same LOD is used for all pixels in the subspan.
 810  *
 811  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 812  * appropriate swizzling.
 813  */
 814 void
 815 fs_generator::generate_ddx(enum opcode opcode,
 816                            struct brw_reg dst, struct brw_reg src)
 817 {
 818    unsigned vstride, width;
 819
 820    if (opcode == FS_OPCODE_DDX_FINE) {
 821       /* produce accurate derivatives */
 822       vstride = BRW_VERTICAL_STRIDE_2;
 823       width = BRW_WIDTH_2;
 824    } else {
 825       /* replicate the derivative at the top-left pixel to other pixels */
 826       vstride = BRW_VERTICAL_STRIDE_4;
 827       width = BRW_WIDTH_4;
 828    }
 829
 830    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 831                                  src.negate, src.abs,
 832                                  BRW_REGISTER_TYPE_F,
 833                                  vstride,
 834                                  width,
 835                                  BRW_HORIZONTAL_STRIDE_0,
 836                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 837    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 838                                  src.negate, src.abs,
 839                                  BRW_REGISTER_TYPE_F,
 840                                  vstride,
 841                                  width,
 842                                  BRW_HORIZONTAL_STRIDE_0,
 843                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 844    brw_ADD(p, dst, src0, negate(src1));
 845 }
 846
 847 /* The negate_value boolean is used to negate the derivative computation for
 848  * FBOs, since they place the origin at the upper left instead of the lower
 849  * left.
 850  */
 851 void
 852 fs_generator::generate_ddy(enum opcode opcode,
 853                            struct brw_reg dst, struct brw_reg src,
 854                            bool negate_value)
 855 {
 856    if (opcode == FS_OPCODE_DDY_FINE) {
 857       /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
 858        * Region Restrictions):
 859        *
 860        *     In Align16 access mode, SIMD16 is not allowed for DW operations
 861        *     and SIMD8 is not allowed for DF operations.
 862        *
 863        * In this context, "DW operations" means "operations acting on 32-bit
 864        * values", so it includes operations on floats.
 865        *
 866        * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
 867        * (Instruction Compression -> Rules and Restrictions):
 868        *
 869        *     A compressed instruction must be in Align1 access mode. Align16
 870        *     mode instructions cannot be compressed.
 871        *
 872        * Similar text exists in the g45 PRM.
 873        *
 874        * On these platforms, if we're building a SIMD16 shader, we need to
 875        * manually unroll to a pair of SIMD8 instructions.
 876        */
 877       bool unroll_to_simd8 =
 878          (dispatch_width == 16 &&
 879           (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
 880
 881       /* produce accurate derivatives */
 882       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 883                                     src.negate, src.abs,
 884                                     BRW_REGISTER_TYPE_F,
 885                                     BRW_VERTICAL_STRIDE_4,
 886                                     BRW_WIDTH_4,
 887                                     BRW_HORIZONTAL_STRIDE_1,
 888                                     BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
 889       struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 890                                     src.negate, src.abs,
 891                                     BRW_REGISTER_TYPE_F,
 892                                     BRW_VERTICAL_STRIDE_4,
 893                                     BRW_WIDTH_4,
 894                                     BRW_HORIZONTAL_STRIDE_1,
 895                                     BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
 896       brw_push_insn_state(p);
 897       brw_set_default_access_mode(p, BRW_ALIGN_16);
 898       if (unroll_to_simd8) {
 899          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
 900          if (negate_value) {
 901             brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
 902             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
 903             brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0)));
 904          } else {
 905             brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1)));
 906             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
 907             brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1)));
 908          }
 909       } else {
 910          if (negate_value)
 911             brw_ADD(p, dst, src1, negate(src0));
 912          else
 913             brw_ADD(p, dst, src0, negate(src1));
 914       }
 915       brw_pop_insn_state(p);
 916    } else {
 917       /* replicate the derivative at the top-left pixel to other pixels */
 918       struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 919                                     src.negate, src.abs,
 920                                     BRW_REGISTER_TYPE_F,
 921                                     BRW_VERTICAL_STRIDE_4,
 922                                     BRW_WIDTH_4,
 923                                     BRW_HORIZONTAL_STRIDE_0,
 924                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 925       struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
 926                                     src.negate, src.abs,
 927                                     BRW_REGISTER_TYPE_F,
 928                                     BRW_VERTICAL_STRIDE_4,
 929                                     BRW_WIDTH_4,
 930                                     BRW_HORIZONTAL_STRIDE_0,
 931                                     BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 932       if (negate_value)
 933          brw_ADD(p, dst, src1, negate(src0));
 934       else
 935          brw_ADD(p, dst, src0, negate(src1));
 936    }
 937 }
 938
 939 void
 940 fs_generator::generate_discard_jump(fs_inst *inst)
 941 {
 942    assert(brw->gen >= 6);
 943
 944    /* This HALT will be patched up at FB write time to point UIP at the end of
 945     * the program, and at brw_uip_jip() JIP will be set to the end of the
 946     * current block (or the program).
 947     */
 948    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
 949
 950    brw_push_insn_state(p);
 951    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 952    gen6_HALT(p);
 953    brw_pop_insn_state(p);
 954 }
 955
 956 void
 957 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
 958 {
 959    assert(inst->mlen != 0);
 960
 961    brw_MOV(p,
 962            brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0),
 963            retype(src, BRW_REGISTER_TYPE_UD));
 964    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
 965                                  inst->exec_size / 8, inst->offset);
 966 }
 967
 968 void
 969 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
 970 {
 971    assert(inst->mlen != 0);
 972
 973    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
 974                                 inst->exec_size / 8, inst->offset);
 975 }
 976
 977 void
 978 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
 979 {
 980    gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
 981 }
 982
 983 void
 984 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 985                                                   struct brw_reg dst,
 986                                                   struct brw_reg index,
 987                                                   struct brw_reg offset)
 988 {
 989    assert(inst->mlen != 0);
 990
 991    assert(index.file == BRW_IMMEDIATE_VALUE &&
 992           index.type == BRW_REGISTER_TYPE_UD);
 993    uint32_t surf_index = index.dw1.ud;
 994
 995    assert(offset.file == BRW_IMMEDIATE_VALUE &&
 996           offset.type == BRW_REGISTER_TYPE_UD);
 997    uint32_t read_offset = offset.dw1.ud;
 998
 999    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1000                         read_offset, surf_index);
1001
1002    brw_mark_surface_used(prog_data, surf_index);
1003 }
1004
1005 void
1006 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1007                                                        struct brw_reg dst,
1008                                                        struct brw_reg index,
1009                                                        struct brw_reg offset)
1010 {
1011    assert(inst->mlen == 0);
1012    assert(index.type == BRW_REGISTER_TYPE_UD);
1013
1014    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
1015    /* Reference just the dword we need, to avoid angering validate_reg(). */
1016    offset = brw_vec1_grf(offset.nr, 0);
1017
1018    /* We use the SIMD4x2 mode because we want to end up with 4 components in
1019     * the destination loaded consecutively from the same offset (which appears
1020     * in the first component, and the rest are ignored).
1021     */
1022    dst.width = BRW_WIDTH_4;
1023
1024    struct brw_reg src = offset;
1025    bool header_present = false;
1026    int mlen = 1;
1027
1028    if (brw->gen >= 9) {
1029       /* Skylake requires a message header in order to use SIMD4x2 mode. */
1030       src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
1031       mlen = 2;
1032       header_present = true;
1033
1034       brw_push_insn_state(p);
1035       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1036       brw_MOV(p, src, retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
1037       brw_set_default_access_mode(p, BRW_ALIGN_1);
1038
1039       brw_MOV(p, get_element_ud(src, 2),
1040               brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
1041       brw_pop_insn_state(p);
1042    }
1043
1044    if (index.file == BRW_IMMEDIATE_VALUE) {
1045
1046       uint32_t surf_index = index.dw1.ud;
1047
1048       brw_push_insn_state(p);
1049       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1050       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1051       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1052       brw_pop_insn_state(p);
1053
1054       brw_set_dest(p, send, dst);
1055       brw_set_src0(p, send, src);
1056       brw_set_sampler_message(p, send,
1057                               surf_index,
1058                               0, /* LD message ignores sampler unit */
1059                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1060                               1, /* rlen */
1061                               mlen,
1062                               header_present,
1063                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
1064                               0);
1065
1066       brw_mark_surface_used(prog_data, surf_index);
1067
1068    } else {
1069
1070       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1071
1072       brw_push_insn_state(p);
1073       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1074       brw_set_default_access_mode(p, BRW_ALIGN_1);
1075
1076       /* a0.0 = surf_index & 0xff */
1077       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1078       brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
1079       brw_set_dest(p, insn_and, addr);
1080       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1081       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1082
1083
1084       /* a0.0 |= <descriptor> */
1085       brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
1086       brw_set_sampler_message(p, insn_or,
1087                               0 /* surface */,
1088                               0 /* sampler */,
1089                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1090                               1 /* rlen */,
1091                               mlen,
1092                               header_present,
1093                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
1094                               0);
1095       brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
1096       brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
1097       brw_set_src0(p, insn_or, addr);
1098       brw_set_dest(p, insn_or, addr);
1099
1100
1101       /* dst = send(offset, a0.0) */
1102       brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
1103       brw_set_dest(p, insn_send, dst);
1104       brw_set_src0(p, insn_send, src);
1105       brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
1106
1107       brw_pop_insn_state(p);
1108
1109       /* visitor knows more than we do about the surface limit required,
1110        * so has already done marking.
1111        */
1112
1113    }
1114 }
1115
1116 void
1117 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
1118                                                   struct brw_reg dst,
1119                                                   struct brw_reg index,
1120                                                   struct brw_reg offset)
1121 {
1122    assert(brw->gen < 7); /* Should use the gen7 variant. */
1123    assert(inst->header_present);
1124    assert(inst->mlen);
1125
1126    assert(index.file == BRW_IMMEDIATE_VALUE &&
1127           index.type == BRW_REGISTER_TYPE_UD);
1128    uint32_t surf_index = index.dw1.ud;
1129
1130    uint32_t simd_mode, rlen, msg_type;
1131    if (dispatch_width == 16) {
1132       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1133       rlen = 8;
1134    } else {
1135       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1136       rlen = 4;
1137    }
1138
1139    if (brw->gen >= 5)
1140       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1141    else {
1142       /* We always use the SIMD16 message so that we only have to load U, and
1143        * not V or R.
1144        */
1145       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1146       assert(inst->mlen == 3);
1147       assert(inst->regs_written == 8);
1148       rlen = 8;
1149       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1150    }
1151
1152    struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
1153                                       BRW_REGISTER_TYPE_D);
1154    brw_MOV(p, offset_mrf, offset);
1155
1156    struct brw_reg header = brw_vec8_grf(0, 0);
1157    gen6_resolve_implied_move(p, &header, inst->base_mrf);
1158
1159    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1160    brw_inst_set_qtr_control(brw, send, BRW_COMPRESSION_NONE);
1161    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1162    brw_set_src0(p, send, header);
1163    if (brw->gen < 6)
1164       brw_inst_set_base_mrf(brw, send, inst->base_mrf);
1165
1166    /* Our surface is set up as floats, regardless of what actual data is
1167     * stored in it.
1168     */
1169    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1170    brw_set_sampler_message(p, send,
1171                            surf_index,
1172                            0, /* sampler (unused) */
1173                            msg_type,
1174                            rlen,
1175                            inst->mlen,
1176                            inst->header_present,
1177                            simd_mode,
1178                            return_format);
1179
1180    brw_mark_surface_used(prog_data, surf_index);
1181 }
1182
1183 void
1184 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1185                                                        struct brw_reg dst,
1186                                                        struct brw_reg index,
1187                                                        struct brw_reg offset)
1188 {
1189    assert(brw->gen >= 7);
1190    /* Varying-offset pull constant loads are treated as a normal expression on
1191     * gen7, so the fact that it's a send message is hidden at the IR level.
1192     */
1193    assert(!inst->header_present);
1194    assert(!inst->mlen);
1195    assert(index.type == BRW_REGISTER_TYPE_UD);
1196
1197    uint32_t simd_mode, rlen, mlen;
1198    if (dispatch_width == 16) {
1199       mlen = 2;
1200       rlen = 8;
1201       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1202    } else {
1203       mlen = 1;
1204       rlen = 4;
1205       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1206    }
1207
1208    if (index.file == BRW_IMMEDIATE_VALUE) {
1209
1210       uint32_t surf_index = index.dw1.ud;
1211
1212       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1213       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1214       brw_set_src0(p, send, offset);
1215       brw_set_sampler_message(p, send,
1216                               surf_index,
1217                               0, /* LD message ignores sampler unit */
1218                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1219                               rlen,
1220                               mlen,
1221                               false, /* no header */
1222                               simd_mode,
1223                               0);
1224
1225       brw_mark_surface_used(prog_data, surf_index);
1226
1227    } else {
1228
1229       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1230
1231       brw_push_insn_state(p);
1232       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1233       brw_set_default_access_mode(p, BRW_ALIGN_1);
1234
1235       /* a0.0 = surf_index & 0xff */
1236       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1237       brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
1238       brw_set_dest(p, insn_and, addr);
1239       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1240       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1241
1242
1243       /* a0.0 |= <descriptor> */
1244       brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
1245       brw_set_sampler_message(p, insn_or,
1246                               0 /* surface */,
1247                               0 /* sampler */,
1248                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1249                               rlen /* rlen */,
1250                               mlen /* mlen */,
1251                               false /* header */,
1252                               simd_mode,
1253                               0);
1254       brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
1255       brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
1256       brw_set_src0(p, insn_or, addr);
1257       brw_set_dest(p, insn_or, addr);
1258
1259
1260       /* dst = send(offset, a0.0) */
1261       brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
1262       brw_set_dest(p, insn_send, retype(dst, BRW_REGISTER_TYPE_UW));
1263       brw_set_src0(p, insn_send, offset);
1264       brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
1265
1266       brw_pop_insn_state(p);
1267
1268       /* visitor knows more than we do about the surface limit required,
1269        * so has already done marking.
1270        */
1271    }
1272 }
1273
1274 /**
1275  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
1276  * into the flags register (f0.0).
1277  *
1278  * Used only on Gen6 and above.
1279  */
1280 void
1281 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1282 {
1283    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1284    struct brw_reg dispatch_mask;
1285
1286    if (brw->gen >= 6)
1287       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1288    else
1289       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1290
1291    brw_push_insn_state(p);
1292    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1293    brw_MOV(p, flags, dispatch_mask);
1294    brw_pop_insn_state(p);
1295 }
1296
1297 void
1298 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1299                                                 struct brw_reg dst,
1300                                                 struct brw_reg src,
1301                                                 struct brw_reg msg_data,
1302                                                 unsigned msg_type)
1303 {
1304    assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
1305           msg_data.type == BRW_REGISTER_TYPE_UD);
1306
1307    brw_pixel_interpolator_query(p,
1308          retype(dst, BRW_REGISTER_TYPE_UW),
1309          src,
1310          inst->pi_noperspective,
1311          msg_type,
1312          msg_data.dw1.ud,
1313          inst->mlen,
1314          inst->regs_written);
1315 }
1316
1317
1318 /**
1319  * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
1320  * sampler LD messages.
1321  *
1322  * We don't want to bake it into the send message's code generation because
1323  * that means we don't get a chance to schedule the instructions.
1324  */
1325 void
1326 fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
1327                                           struct brw_reg dst,
1328                                           struct brw_reg value)
1329 {
1330    assert(value.file == BRW_IMMEDIATE_VALUE);
1331
1332    brw_push_insn_state(p);
1333    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1334    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1335    brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
1336    brw_pop_insn_state(p);
1337 }
1338
1339 /* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
1340  * (when mask is passed as a uniform) of register mask before moving it
1341  * to register dst.
1342  */
1343 void
1344 fs_generator::generate_set_omask(fs_inst *inst,
1345                                  struct brw_reg dst,
1346                                  struct brw_reg mask)
1347 {
1348    bool stride_8_8_1 =
1349     (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
1350      mask.width == BRW_WIDTH_8 &&
1351      mask.hstride == BRW_HORIZONTAL_STRIDE_1);
1352
1353    bool stride_0_1_0 = has_scalar_region(mask);
1354
1355    assert(stride_8_8_1 || stride_0_1_0);
1356    assert(dst.type == BRW_REGISTER_TYPE_UW);
1357
1358    if (dispatch_width == 16)
1359       dst = vec16(dst);
1360    brw_push_insn_state(p);
1361    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1362    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1363
1364    if (stride_8_8_1) {
1365       brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
1366    } else if (stride_0_1_0) {
1367       brw_MOV(p, dst, retype(mask, dst.type));
1368    }
1369    brw_pop_insn_state(p);
1370 }
1371
1372 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1373  * the ADD instruction.
1374  */
1375 void
1376 fs_generator::generate_set_sample_id(fs_inst *inst,
1377                                      struct brw_reg dst,
1378                                      struct brw_reg src0,
1379                                      struct brw_reg src1)
1380 {
1381    assert(dst.type == BRW_REGISTER_TYPE_D ||
1382           dst.type == BRW_REGISTER_TYPE_UD);
1383    assert(src0.type == BRW_REGISTER_TYPE_D ||
1384           src0.type == BRW_REGISTER_TYPE_UD);
1385
1386    brw_push_insn_state(p);
1387    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1388    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1389    struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
1390    if (dispatch_width == 8) {
1391       brw_ADD(p, dst, src0, reg);
1392    } else if (dispatch_width == 16) {
1393       brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
1394       brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
1395    }
1396    brw_pop_insn_state(p);
1397 }
1398
1399 void
1400 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1401                                             struct brw_reg dst,
1402                                             struct brw_reg x,
1403                                             struct brw_reg y)
1404 {
1405    assert(brw->gen >= 7);
1406    assert(dst.type == BRW_REGISTER_TYPE_UD);
1407    assert(x.type == BRW_REGISTER_TYPE_F);
1408    assert(y.type == BRW_REGISTER_TYPE_F);
1409
1410    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1411     *
1412     *   Because this instruction does not have a 16-bit floating-point type,
1413     *   the destination data type must be Word (W).
1414     *
1415     *   The destination must be DWord-aligned and specify a horizontal stride
1416     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1417     *   each destination channel and the upper word is not modified.
1418     */
1419    struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1420
1421    /* Give each 32-bit channel of dst the form below, where "." means
1422     * unchanged.
1423     *   0x....hhhh
1424     */
1425    brw_F32TO16(p, dst_w, y);
1426
1427    /* Now the form:
1428     *   0xhhhh0000
1429     */
1430    brw_SHL(p, dst, dst, brw_imm_ud(16u));
1431
1432    /* And, finally the form of packHalf2x16's output:
1433     *   0xhhhhllll
1434     */
1435    brw_F32TO16(p, dst_w, x);
1436 }
1437
1438 void
1439 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1440                                               struct brw_reg dst,
1441                                               struct brw_reg src)
1442 {
1443    assert(brw->gen >= 7);
1444    assert(dst.type == BRW_REGISTER_TYPE_F);
1445    assert(src.type == BRW_REGISTER_TYPE_UD);
1446
1447    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1448     *
1449     *   Because this instruction does not have a 16-bit floating-point type,
1450     *   the source data type must be Word (W). The destination type must be
1451     *   F (Float).
1452     */
1453    struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1454
1455    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1456     * For the Y case, we wish to access only the upper word; therefore
1457     * a 16-bit subregister offset is needed.
1458     */
1459    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1460           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1461    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1462       src_w.subnr += 2;
1463
1464    brw_F16TO32(p, dst, src_w);
1465 }
1466
1467 void
1468 fs_generator::generate_shader_time_add(fs_inst *inst,
1469                                        struct brw_reg payload,
1470                                        struct brw_reg offset,
1471                                        struct brw_reg value)
1472 {
1473    assert(brw->gen >= 7);
1474    brw_push_insn_state(p);
1475    brw_set_default_mask_control(p, true);
1476
1477    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1478    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1479                                           offset.type);
1480    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1481                                          value.type);
1482
1483    assert(offset.file == BRW_IMMEDIATE_VALUE);
1484    if (value.file == BRW_GENERAL_REGISTER_FILE) {
1485       value.width = BRW_WIDTH_1;
1486       value.hstride = BRW_HORIZONTAL_STRIDE_0;
1487       value.vstride = BRW_VERTICAL_STRIDE_0;
1488    } else {
1489       assert(value.file == BRW_IMMEDIATE_VALUE);
1490    }
1491
1492    /* Trying to deal with setup of the params from the IR is crazy in the FS8
1493     * case, and we don't really care about squeezing every bit of performance
1494     * out of this path, so we just emit the MOVs from here.
1495     */
1496    brw_MOV(p, payload_offset, offset);
1497    brw_MOV(p, payload_value, value);
1498    brw_shader_time_add(p, payload,
1499                        prog_data->binding_table.shader_time_start);
1500    brw_pop_insn_state(p);
1501
1502    brw_mark_surface_used(prog_data,
1503                          prog_data->binding_table.shader_time_start);
1504 }
1505
1506 void
1507 fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
1508                                       struct brw_reg payload,
1509                                       struct brw_reg atomic_op,
1510                                       struct brw_reg surf_index)
1511 {
1512    assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
1513           atomic_op.type == BRW_REGISTER_TYPE_UD &&
1514           surf_index.file == BRW_IMMEDIATE_VALUE &&
1515           surf_index.type == BRW_REGISTER_TYPE_UD);
1516
1517    brw_untyped_atomic(p, dst, payload, atomic_op.dw1.ud, surf_index.dw1.ud,
1518                       inst->mlen, inst->exec_size / 8);
1519
1520    brw_mark_surface_used(prog_data, surf_index.dw1.ud);
1521 }
1522
1523 void
1524 fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
1525                                             struct brw_reg payload,
1526                                             struct brw_reg surf_index)
1527 {
1528    assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
1529           surf_index.type == BRW_REGISTER_TYPE_UD);
1530
1531    brw_untyped_surface_read(p, dst, payload,
1532                             surf_index.dw1.ud,
1533                             inst->mlen, inst->exec_size / 8);
1534
1535    brw_mark_surface_used(prog_data, surf_index.dw1.ud);
1536 }
1537
1538 void
1539 fs_generator::enable_debug(const char *shader_name)
1540 {
1541    debug_flag = true;
1542    this->shader_name = shader_name;
1543 }
1544
1545 int
1546 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1547 {
1548    /* align to 64 byte boundary. */
1549    while (p->next_insn_offset % 64)
1550       brw_NOP(p);
1551
1552    this->dispatch_width = dispatch_width;
1553    if (dispatch_width == 16)
1554       brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1555
1556    int start_offset = p->next_insn_offset;
1557    int loop_count = 0;
1558
1559    struct annotation_info annotation;
1560    memset(&annotation, 0, sizeof(annotation));
1561
1562    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1563       struct brw_reg src[3], dst;
1564       unsigned int last_insn_offset = p->next_insn_offset;
1565       bool multiple_instructions_emitted = false;
1566
1567       if (unlikely(debug_flag))
1568          annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
1569
1570       for (unsigned int i = 0; i < inst->sources; i++) {
1571          src[i] = brw_reg_from_fs_reg(&inst->src[i]);
1572
1573          /* The accumulator result appears to get used for the
1574           * conditional modifier generation.  When negating a UD
1575           * value, there is a 33rd bit generated for the sign in the
1576           * accumulator value, so now you can't check, for example,
1577           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1578           */
1579          assert(!inst->conditional_mod ||
1580                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1581                 !inst->src[i].negate);
1582       }
1583       dst = brw_reg_from_fs_reg(&inst->dst);
1584
1585       brw_set_default_predicate_control(p, inst->predicate);
1586       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1587       brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1588       brw_set_default_saturate(p, inst->saturate);
1589       brw_set_default_mask_control(p, inst->force_writemask_all);
1590       brw_set_default_acc_write_control(p, inst->writes_accumulator);
1591
1592       switch (inst->exec_size) {
1593       case 1:
1594       case 2:
1595       case 4:
1596          assert(inst->force_writemask_all);
1597          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1598          break;
1599       case 8:
1600          if (inst->force_sechalf) {
1601             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1602          } else {
1603             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1604          }
1605          break;
1606       case 16:
1607          brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1608          break;
1609       default:
1610          unreachable(!"Invalid instruction width");
1611       }
1612
1613       switch (inst->opcode) {
1614       case BRW_OPCODE_MOV:
1615          brw_MOV(p, dst, src[0]);
1616          break;
1617       case BRW_OPCODE_ADD:
1618          brw_ADD(p, dst, src[0], src[1]);
1619          break;
1620       case BRW_OPCODE_MUL:
1621          brw_MUL(p, dst, src[0], src[1]);
1622          break;
1623       case BRW_OPCODE_AVG:
1624          brw_AVG(p, dst, src[0], src[1]);
1625          break;
1626       case BRW_OPCODE_MACH:
1627          brw_MACH(p, dst, src[0], src[1]);
1628          break;
1629
1630       case BRW_OPCODE_LINE:
1631          brw_LINE(p, dst, src[0], src[1]);
1632          break;
1633
1634       case BRW_OPCODE_MAD:
1635          assert(brw->gen >= 6);
1636          brw_set_default_access_mode(p, BRW_ALIGN_16);
1637          if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1638             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1639             brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1640             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1641             brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1642             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1643
1644             if (inst->conditional_mod) {
1645                brw_inst_set_cond_modifier(brw, f, inst->conditional_mod);
1646                brw_inst_set_cond_modifier(brw, s, inst->conditional_mod);
1647                multiple_instructions_emitted = true;
1648             }
1649          } else {
1650             brw_MAD(p, dst, src[0], src[1], src[2]);
1651          }
1652          brw_set_default_access_mode(p, BRW_ALIGN_1);
1653          break;
1654
1655       case BRW_OPCODE_LRP:
1656          assert(brw->gen >= 6);
1657          brw_set_default_access_mode(p, BRW_ALIGN_16);
1658          if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1659             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1660             brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1661             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1662             brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1663             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1664
1665             if (inst->conditional_mod) {
1666                brw_inst_set_cond_modifier(brw, f, inst->conditional_mod);
1667                brw_inst_set_cond_modifier(brw, s, inst->conditional_mod);
1668                multiple_instructions_emitted = true;
1669             }
1670          } else {
1671             brw_LRP(p, dst, src[0], src[1], src[2]);
1672          }
1673          brw_set_default_access_mode(p, BRW_ALIGN_1);
1674          break;
1675
1676       case BRW_OPCODE_FRC:
1677          brw_FRC(p, dst, src[0]);
1678          break;
1679       case BRW_OPCODE_RNDD:
1680          brw_RNDD(p, dst, src[0]);
1681          break;
1682       case BRW_OPCODE_RNDE:
1683          brw_RNDE(p, dst, src[0]);
1684          break;
1685       case BRW_OPCODE_RNDZ:
1686          brw_RNDZ(p, dst, src[0]);
1687          break;
1688
1689       case BRW_OPCODE_AND:
1690          brw_AND(p, dst, src[0], src[1]);
1691          break;
1692       case BRW_OPCODE_OR:
1693          brw_OR(p, dst, src[0], src[1]);
1694          break;
1695       case BRW_OPCODE_XOR:
1696          brw_XOR(p, dst, src[0], src[1]);
1697          break;
1698       case BRW_OPCODE_NOT:
1699          brw_NOT(p, dst, src[0]);
1700          break;
1701       case BRW_OPCODE_ASR:
1702          brw_ASR(p, dst, src[0], src[1]);
1703          break;
1704       case BRW_OPCODE_SHR:
1705          brw_SHR(p, dst, src[0], src[1]);
1706          break;
1707       case BRW_OPCODE_SHL:
1708          brw_SHL(p, dst, src[0], src[1]);
1709          break;
1710       case BRW_OPCODE_F32TO16:
1711          assert(brw->gen >= 7);
1712          brw_F32TO16(p, dst, src[0]);
1713          break;
1714       case BRW_OPCODE_F16TO32:
1715          assert(brw->gen >= 7);
1716          brw_F16TO32(p, dst, src[0]);
1717          break;
1718       case BRW_OPCODE_CMP:
1719          /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says
1720           * that when the destination is a GRF that the dependency-clear bit on
1721           * the flag register is cleared early.
1722           *
1723           * Suggested workarounds are to disable coissuing CMP instructions
1724           * or to split CMP(16) instructions into two CMP(8) instructions.
1725           *
1726           * We choose to split into CMP(8) instructions since disabling
1727           * coissuing would affect CMP instructions not otherwise affected by
1728           * the errata.
1729           */
1730          if (dispatch_width == 16 && brw->gen == 7 && !brw->is_haswell) {
1731             if (dst.file == BRW_GENERAL_REGISTER_FILE) {
1732                brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1733                brw_CMP(p, firsthalf(dst), inst->conditional_mod,
1734                           firsthalf(src[0]), firsthalf(src[1]));
1735                brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1736                brw_CMP(p, sechalf(dst), inst->conditional_mod,
1737                           sechalf(src[0]), sechalf(src[1]));
1738                brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1739
1740                multiple_instructions_emitted = true;
1741             } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1742                /* For unknown reasons, the aforementioned workaround is not
1743                 * sufficient. Overriding the type when the destination is the
1744                 * null register is necessary but not sufficient by itself.
1745                 */
1746                assert(dst.nr == BRW_ARF_NULL);
1747                dst.type = BRW_REGISTER_TYPE_D;
1748                brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1749             } else {
1750                unreachable("not reached");
1751             }
1752          } else {
1753             brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1754          }
1755          break;
1756       case BRW_OPCODE_SEL:
1757          brw_SEL(p, dst, src[0], src[1]);
1758          break;
1759       case BRW_OPCODE_BFREV:
1760          assert(brw->gen >= 7);
1761          /* BFREV only supports UD type for src and dst. */
1762          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1763                       retype(src[0], BRW_REGISTER_TYPE_UD));
1764          break;
1765       case BRW_OPCODE_FBH:
1766          assert(brw->gen >= 7);
1767          /* FBH only supports UD type for dst. */
1768          brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1769          break;
1770       case BRW_OPCODE_FBL:
1771          assert(brw->gen >= 7);
1772          /* FBL only supports UD type for dst. */
1773          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1774          break;
1775       case BRW_OPCODE_CBIT:
1776          assert(brw->gen >= 7);
1777          /* CBIT only supports UD type for dst. */
1778          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1779          break;
1780       case BRW_OPCODE_ADDC:
1781          assert(brw->gen >= 7);
1782          brw_ADDC(p, dst, src[0], src[1]);
1783          break;
1784       case BRW_OPCODE_SUBB:
1785          assert(brw->gen >= 7);
1786          brw_SUBB(p, dst, src[0], src[1]);
1787          break;
1788       case BRW_OPCODE_MAC:
1789          brw_MAC(p, dst, src[0], src[1]);
1790          break;
1791
1792       case BRW_OPCODE_BFE:
1793          assert(brw->gen >= 7);
1794          brw_set_default_access_mode(p, BRW_ALIGN_16);
1795          if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1796             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1797             brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1798             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1799             brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1800             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1801          } else {
1802             brw_BFE(p, dst, src[0], src[1], src[2]);
1803          }
1804          brw_set_default_access_mode(p, BRW_ALIGN_1);
1805          break;
1806
1807       case BRW_OPCODE_BFI1:
1808          assert(brw->gen >= 7);
1809          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1810           * should
1811           *
1812           *    "Force BFI instructions to be executed always in SIMD8."
1813           */
1814          if (dispatch_width == 16 && brw->is_haswell) {
1815             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1816             brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]));
1817             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1818             brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
1819             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1820          } else {
1821             brw_BFI1(p, dst, src[0], src[1]);
1822          }
1823          break;
1824       case BRW_OPCODE_BFI2:
1825          assert(brw->gen >= 7);
1826          brw_set_default_access_mode(p, BRW_ALIGN_16);
1827          /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1828           * should
1829           *
1830           *    "Force BFI instructions to be executed always in SIMD8."
1831           *
1832           * Otherwise we would be able to emit compressed instructions like we
1833           * do for the other three-source instructions.
1834           */
1835          if (dispatch_width == 16 && brw->gen < 8) {
1836             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1837             brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1838             brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1839             brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1840             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1841          } else {
1842             brw_BFI2(p, dst, src[0], src[1], src[2]);
1843          }
1844          brw_set_default_access_mode(p, BRW_ALIGN_1);
1845          break;
1846
1847       case BRW_OPCODE_IF:
1848          if (inst->src[0].file != BAD_FILE) {
1849             /* The instruction has an embedded compare (only allowed on gen6) */
1850             assert(brw->gen == 6);
1851             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1852          } else {
1853             brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1854          }
1855          break;
1856
1857       case BRW_OPCODE_ELSE:
1858          brw_ELSE(p);
1859          break;
1860       case BRW_OPCODE_ENDIF:
1861          brw_ENDIF(p);
1862          break;
1863
1864       case BRW_OPCODE_DO:
1865          brw_DO(p, BRW_EXECUTE_8);
1866          break;
1867
1868       case BRW_OPCODE_BREAK:
1869          brw_BREAK(p);
1870          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1871          break;
1872       case BRW_OPCODE_CONTINUE:
1873          brw_CONT(p);
1874          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1875          break;
1876
1877       case BRW_OPCODE_WHILE:
1878          brw_WHILE(p);
1879          loop_count++;
1880          break;
1881
1882       case SHADER_OPCODE_RCP:
1883       case SHADER_OPCODE_RSQ:
1884       case SHADER_OPCODE_SQRT:
1885       case SHADER_OPCODE_EXP2:
1886       case SHADER_OPCODE_LOG2:
1887       case SHADER_OPCODE_SIN:
1888       case SHADER_OPCODE_COS:
1889          assert(brw->gen < 6 || inst->mlen == 0);
1890          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1891          if (brw->gen >= 7) {
1892             gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
1893                       brw_null_reg());
1894          } else if (brw->gen == 6) {
1895             generate_math_gen6(inst, dst, src[0], brw_null_reg());
1896          } else if (brw->gen == 5 || brw->is_g4x) {
1897             generate_math_g45(inst, dst, src[0]);
1898          } else {
1899             generate_math_gen4(inst, dst, src[0]);
1900          }
1901          break;
1902       case SHADER_OPCODE_INT_QUOTIENT:
1903       case SHADER_OPCODE_INT_REMAINDER:
1904       case SHADER_OPCODE_POW:
1905          assert(brw->gen < 6 || inst->mlen == 0);
1906          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1907          if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
1908             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1909          } else if (brw->gen >= 6) {
1910             generate_math_gen6(inst, dst, src[0], src[1]);
1911          } else {
1912             generate_math_gen4(inst, dst, src[0]);
1913          }
1914          break;
1915       case FS_OPCODE_PIXEL_X:
1916          generate_pixel_xy(dst, true);
1917          break;
1918       case FS_OPCODE_PIXEL_Y:
1919          generate_pixel_xy(dst, false);
1920          break;
1921       case FS_OPCODE_CINTERP:
1922          brw_MOV(p, dst, src[0]);
1923          break;
1924       case FS_OPCODE_LINTERP:
1925          generate_linterp(inst, dst, src);
1926          break;
1927       case SHADER_OPCODE_TEX:
1928       case FS_OPCODE_TXB:
1929       case SHADER_OPCODE_TXD:
1930       case SHADER_OPCODE_TXF:
1931       case SHADER_OPCODE_TXF_CMS:
1932       case SHADER_OPCODE_TXF_UMS:
1933       case SHADER_OPCODE_TXF_MCS:
1934       case SHADER_OPCODE_TXL:
1935       case SHADER_OPCODE_TXS:
1936       case SHADER_OPCODE_LOD:
1937       case SHADER_OPCODE_TG4:
1938       case SHADER_OPCODE_TG4_OFFSET:
1939          generate_tex(inst, dst, src[0], src[1]);
1940          break;
1941       case FS_OPCODE_DDX_COARSE:
1942       case FS_OPCODE_DDX_FINE:
1943          generate_ddx(inst->opcode, dst, src[0]);
1944          break;
1945       case FS_OPCODE_DDY_COARSE:
1946       case FS_OPCODE_DDY_FINE:
1947          assert(src[1].file == BRW_IMMEDIATE_VALUE);
1948          generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud);
1949          break;
1950
1951       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1952          generate_scratch_write(inst, src[0]);
1953          break;
1954
1955       case SHADER_OPCODE_GEN4_SCRATCH_READ:
1956          generate_scratch_read(inst, dst);
1957          break;
1958
1959       case SHADER_OPCODE_GEN7_SCRATCH_READ:
1960          generate_scratch_read_gen7(inst, dst);
1961          break;
1962
1963       case SHADER_OPCODE_URB_WRITE_SIMD8:
1964          generate_urb_write(inst, src[0]);
1965          break;
1966
1967       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1968          generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1969          break;
1970
1971       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1972          generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1973          break;
1974
1975       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1976          generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
1977          break;
1978
1979       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1980          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1981          break;
1982
1983       case FS_OPCODE_REP_FB_WRITE:
1984       case FS_OPCODE_FB_WRITE:
1985          generate_fb_write(inst, src[0]);
1986          break;
1987
1988       case FS_OPCODE_BLORP_FB_WRITE:
1989          generate_blorp_fb_write(inst);
1990          break;
1991
1992       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1993          generate_mov_dispatch_to_flags(inst);
1994          break;
1995
1996       case FS_OPCODE_DISCARD_JUMP:
1997          generate_discard_jump(inst);
1998          break;
1999
2000       case SHADER_OPCODE_SHADER_TIME_ADD:
2001          generate_shader_time_add(inst, src[0], src[1], src[2]);
2002          break;
2003
2004       case SHADER_OPCODE_UNTYPED_ATOMIC:
2005          generate_untyped_atomic(inst, dst, src[0], src[1], src[2]);
2006          break;
2007
2008       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
2009          generate_untyped_surface_read(inst, dst, src[0], src[1]);
2010          break;
2011
2012       case FS_OPCODE_SET_SIMD4X2_OFFSET:
2013          generate_set_simd4x2_offset(inst, dst, src[0]);
2014          break;
2015
2016       case FS_OPCODE_SET_OMASK:
2017          generate_set_omask(inst, dst, src[0]);
2018          break;
2019
2020       case FS_OPCODE_SET_SAMPLE_ID:
2021          generate_set_sample_id(inst, dst, src[0], src[1]);
2022          break;
2023
2024       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2025           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2026           break;
2027
2028       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
2029       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
2030          generate_unpack_half_2x16_split(inst, dst, src[0]);
2031          break;
2032
2033       case FS_OPCODE_PLACEHOLDER_HALT:
2034          /* This is the place where the final HALT needs to be inserted if
2035           * we've emitted any discards.  If not, this will emit no code.
2036           */
2037          if (!patch_discard_jumps_to_fb_writes()) {
2038             if (unlikely(debug_flag)) {
2039                annotation.ann_count--;
2040             }
2041          }
2042          break;
2043
2044       case FS_OPCODE_INTERPOLATE_AT_CENTROID:
2045          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2046                                            GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
2047          break;
2048
2049       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2050          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2051                                            GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2052          break;
2053
2054       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2055          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2056                                            GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2057          break;
2058
2059       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2060          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2061                                            GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2062          break;
2063
2064       default:
2065          if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
2066             _mesa_problem(ctx, "Unsupported opcode `%s' in %s",
2067                           opcode_descs[inst->opcode].name, stage_abbrev);
2068          } else {
2069             _mesa_problem(ctx, "Unsupported opcode %d in %s", inst->opcode,
2070                           stage_abbrev);
2071          }
2072          abort();
2073
2074       case SHADER_OPCODE_LOAD_PAYLOAD:
2075          unreachable("Should be lowered by lower_load_payload()");
2076       }
2077
2078       if (multiple_instructions_emitted)
2079          continue;
2080
2081       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2082          assert(p->next_insn_offset == last_insn_offset + 16 ||
2083                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2084                  "emitting more than 1 instruction");
2085
2086          brw_inst *last = &p->store[last_insn_offset / 16];
2087
2088          if (inst->conditional_mod)
2089             brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
2090          brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
2091          brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
2092       }
2093    }
2094
2095    brw_set_uip_jip(p);
2096    annotation_finalize(&annotation, p->next_insn_offset);
2097
2098    int before_size = p->next_insn_offset - start_offset;
2099    brw_compact_instructions(p, start_offset, annotation.ann_count,
2100                             annotation.ann);
2101    int after_size = p->next_insn_offset - start_offset;
2102
2103    if (unlikely(debug_flag)) {
2104       fprintf(stderr, "Native code for %s\n"
2105               "SIMD%d shader: %d instructions. %d loops. Compacted %d to %d"
2106               " bytes (%.0f%%)\n",
2107               shader_name,
2108               dispatch_width, before_size / 16, loop_count, before_size, after_size,
2109               100.0f * (before_size - after_size) / before_size);
2110
2111       dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
2112       ralloc_free(annotation.ann);
2113    }
2114
2115    static GLuint msg_id = 0;
2116    _mesa_gl_debug(&brw->ctx, &msg_id,
2117                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
2118                   MESA_DEBUG_TYPE_OTHER,
2119                   MESA_DEBUG_SEVERITY_NOTIFICATION,
2120                   "%s SIMD%d shader: %d inst, %d loops, "
2121                   "compacted %d to %d bytes.\n",
2122                   stage_abbrev, dispatch_width, before_size / 16, loop_count,
2123                   before_size, after_size);
2124
2125    return start_offset;
2126 }
2127
2128 const unsigned *
2129 fs_generator::get_assembly(unsigned int *assembly_size)
2130 {
2131    return brw_get_program(p, assembly_size);
2132 }