src/mesa/drivers/dri/i965/brw_fs_emit.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_emit.cpp
  25  *
  26  * This file supports emitting code from the FS LIR to the actual
  27  * native instructions.
  28  */
  29
  30 extern "C" {
  31 #include "main/macros.h"
  32 #include "brw_context.h"
  33 #include "brw_eu.h"
  34 } /* extern "C" */
  35
  36 #include "brw_fs.h"
  37 #include "brw_cfg.h"
  38 #include "glsl/ir_print_visitor.h"
  39
  40 void
  41 fs_visitor::generate_fb_write(fs_inst *inst)
  42 {
  43    bool eot = inst->eot;
  44    struct brw_reg implied_header;
  45    uint32_t msg_control;
  46
  47    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
  48     * move, here's g1.
  49     */
  50    brw_push_insn_state(p);
  51    brw_set_mask_control(p, BRW_MASK_DISABLE);
  52    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  53
  54    if (inst->header_present) {
  55       if (intel->gen >= 6) {
  56          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  57          brw_MOV(p,
  58                  retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
  59                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
  60          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  61
  62          if (inst->target > 0 &&
  63              c->key.nr_color_regions > 1 &&
  64              c->key.sample_alpha_to_coverage) {
  65             /* Set "Source0 Alpha Present to RenderTarget" bit in message
  66              * header.
  67              */
  68             brw_OR(p,
  69                    vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
  70                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
  71                    brw_imm_ud(0x1 << 11));
  72          }
  73
  74          if (inst->target > 0) {
  75             /* Set the render target index for choosing BLEND_STATE. */
  76             brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
  77                                            inst->base_mrf, 2),
  78                               BRW_REGISTER_TYPE_UD),
  79                     brw_imm_ud(inst->target));
  80          }
  81
  82          implied_header = brw_null_reg();
  83       } else {
  84          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
  85
  86          brw_MOV(p,
  87                  brw_message_reg(inst->base_mrf + 1),
  88                  brw_vec8_grf(1, 0));
  89       }
  90    } else {
  91       implied_header = brw_null_reg();
  92    }
  93
  94    if (this->dual_src_output.file != BAD_FILE)
  95       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
  96    else if (dispatch_width == 16)
  97       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
  98    else
  99       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 100
 101    brw_pop_insn_state(p);
 102
 103    brw_fb_WRITE(p,
 104                 dispatch_width,
 105                 inst->base_mrf,
 106                 implied_header,
 107                 msg_control,
 108                 inst->target,
 109                 inst->mlen,
 110                 0,
 111                 eot,
 112                 inst->header_present);
 113 }
 114
 115 /* Computes the integer pixel x,y values from the origin.
 116  *
 117  * This is the basis of gl_FragCoord computation, but is also used
 118  * pre-gen6 for computing the deltas from v0 for computing
 119  * interpolation.
 120  */
 121 void
 122 fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
 123 {
 124    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 125    struct brw_reg src;
 126    struct brw_reg deltas;
 127
 128    if (is_x) {
 129       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
 130       deltas = brw_imm_v(0x10101010);
 131    } else {
 132       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
 133       deltas = brw_imm_v(0x11001100);
 134    }
 135
 136    if (dispatch_width == 16) {
 137       dst = vec16(dst);
 138    }
 139
 140    /* We do this 8 or 16-wide, but since the destination is UW we
 141     * don't do compression in the 16-wide case.
 142     */
 143    brw_push_insn_state(p);
 144    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 145    brw_ADD(p, dst, src, deltas);
 146    brw_pop_insn_state(p);
 147 }
 148
 149 void
 150 fs_visitor::generate_linterp(fs_inst *inst,
 151                              struct brw_reg dst, struct brw_reg *src)
 152 {
 153    struct brw_reg delta_x = src[0];
 154    struct brw_reg delta_y = src[1];
 155    struct brw_reg interp = src[2];
 156
 157    if (brw->has_pln &&
 158        delta_y.nr == delta_x.nr + 1 &&
 159        (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
 160       brw_PLN(p, dst, interp, delta_x);
 161    } else {
 162       brw_LINE(p, brw_null_reg(), interp, delta_x);
 163       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
 164    }
 165 }
 166
 167 void
 168 fs_visitor::generate_math1_gen7(fs_inst *inst,
 169                                 struct brw_reg dst,
 170                                 struct brw_reg src0)
 171 {
 172    assert(inst->mlen == 0);
 173    brw_math(p, dst,
 174             brw_math_function(inst->opcode),
 175             0, src0,
 176             BRW_MATH_DATA_VECTOR,
 177             BRW_MATH_PRECISION_FULL);
 178 }
 179
 180 void
 181 fs_visitor::generate_math2_gen7(fs_inst *inst,
 182                                 struct brw_reg dst,
 183                                 struct brw_reg src0,
 184                                 struct brw_reg src1)
 185 {
 186    assert(inst->mlen == 0);
 187    brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
 188 }
 189
 190 void
 191 fs_visitor::generate_math1_gen6(fs_inst *inst,
 192                                 struct brw_reg dst,
 193                                 struct brw_reg src0)
 194 {
 195    int op = brw_math_function(inst->opcode);
 196
 197    assert(inst->mlen == 0);
 198
 199    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 200    brw_math(p, dst,
 201             op,
 202             0, src0,
 203             BRW_MATH_DATA_VECTOR,
 204             BRW_MATH_PRECISION_FULL);
 205
 206    if (dispatch_width == 16) {
 207       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 208       brw_math(p, sechalf(dst),
 209                op,
 210                0, sechalf(src0),
 211                BRW_MATH_DATA_VECTOR,
 212                BRW_MATH_PRECISION_FULL);
 213       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 214    }
 215 }
 216
 217 void
 218 fs_visitor::generate_math2_gen6(fs_inst *inst,
 219                                 struct brw_reg dst,
 220                                 struct brw_reg src0,
 221                                 struct brw_reg src1)
 222 {
 223    int op = brw_math_function(inst->opcode);
 224
 225    assert(inst->mlen == 0);
 226
 227    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 228    brw_math2(p, dst, op, src0, src1);
 229
 230    if (dispatch_width == 16) {
 231       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 232       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
 233       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 234    }
 235 }
 236
 237 void
 238 fs_visitor::generate_math_gen4(fs_inst *inst,
 239                                struct brw_reg dst,
 240                                struct brw_reg src)
 241 {
 242    int op = brw_math_function(inst->opcode);
 243
 244    assert(inst->mlen >= 1);
 245
 246    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 247    brw_math(p, dst,
 248             op,
 249             inst->base_mrf, src,
 250             BRW_MATH_DATA_VECTOR,
 251             BRW_MATH_PRECISION_FULL);
 252
 253    if (dispatch_width == 16) {
 254       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 255       brw_math(p, sechalf(dst),
 256                op,
 257                inst->base_mrf + 1, sechalf(src),
 258                BRW_MATH_DATA_VECTOR,
 259                BRW_MATH_PRECISION_FULL);
 260
 261       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 262    }
 263 }
 264
 265 void
 266 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 267 {
 268    int msg_type = -1;
 269    int rlen = 4;
 270    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 271    uint32_t return_format;
 272
 273    switch (dst.type) {
 274    case BRW_REGISTER_TYPE_D:
 275       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
 276       break;
 277    case BRW_REGISTER_TYPE_UD:
 278       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
 279       break;
 280    default:
 281       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
 282       break;
 283    }
 284
 285    if (dispatch_width == 16)
 286       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 287
 288    if (intel->gen >= 5) {
 289       switch (inst->opcode) {
 290       case SHADER_OPCODE_TEX:
 291          if (inst->shadow_compare) {
 292             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
 293          } else {
 294             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
 295          }
 296          break;
 297       case FS_OPCODE_TXB:
 298          if (inst->shadow_compare) {
 299             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
 300          } else {
 301             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
 302          }
 303          break;
 304       case SHADER_OPCODE_TXL:
 305          if (inst->shadow_compare) {
 306             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
 307          } else {
 308             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 309          }
 310          break;
 311       case SHADER_OPCODE_TXS:
 312          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
 313          break;
 314       case SHADER_OPCODE_TXD:
 315          /* There is no sample_d_c message; comparisons are done manually */
 316          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 317          break;
 318       case SHADER_OPCODE_TXF:
 319          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 320          break;
 321       default:
 322          assert(!"not reached");
 323          break;
 324       }
 325    } else {
 326       switch (inst->opcode) {
 327       case SHADER_OPCODE_TEX:
 328          /* Note that G45 and older determines shadow compare and dispatch width
 329           * from message length for most messages.
 330           */
 331          assert(dispatch_width == 8);
 332          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
 333          if (inst->shadow_compare) {
 334             assert(inst->mlen == 6);
 335          } else {
 336             assert(inst->mlen <= 4);
 337          }
 338          break;
 339       case FS_OPCODE_TXB:
 340          if (inst->shadow_compare) {
 341             assert(inst->mlen == 6);
 342             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 343          } else {
 344             assert(inst->mlen == 9);
 345             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 346             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 347          }
 348          break;
 349       case SHADER_OPCODE_TXL:
 350          if (inst->shadow_compare) {
 351             assert(inst->mlen == 6);
 352             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 353          } else {
 354             assert(inst->mlen == 9);
 355             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
 356             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 357          }
 358          break;
 359       case SHADER_OPCODE_TXD:
 360          /* There is no sample_d_c message; comparisons are done manually */
 361          assert(inst->mlen == 7 || inst->mlen == 10);
 362          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 363          break;
 364       case SHADER_OPCODE_TXF:
 365          assert(inst->mlen == 9);
 366          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
 367          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 368          break;
 369       case SHADER_OPCODE_TXS:
 370          assert(inst->mlen == 3);
 371          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
 372          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 373          break;
 374       default:
 375          assert(!"not reached");
 376          break;
 377       }
 378    }
 379    assert(msg_type != -1);
 380
 381    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
 382       rlen = 8;
 383       dst = vec16(dst);
 384    }
 385
 386    /* Load the message header if present.  If there's a texture offset,
 387     * we need to set it up explicitly and load the offset bitfield.
 388     * Otherwise, we can use an implied move from g0 to the first message reg.
 389     */
 390    if (inst->texture_offset) {
 391       brw_push_insn_state(p);
 392       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 393       /* Explicitly set up the message header by copying g0 to the MRF. */
 394       brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
 395                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 396
 397       /* Then set the offset bits in DWord 2. */
 398       brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
 399                                      inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
 400                  brw_imm_ud(inst->texture_offset));
 401       brw_pop_insn_state(p);
 402    } else if (inst->header_present) {
 403       /* Set up an implied move from g0 to the MRF. */
 404       src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 405    }
 406
 407    brw_SAMPLE(p,
 408               retype(dst, BRW_REGISTER_TYPE_UW),
 409               inst->base_mrf,
 410               src,
 411               SURF_INDEX_TEXTURE(inst->sampler),
 412               inst->sampler,
 413               WRITEMASK_XYZW,
 414               msg_type,
 415               rlen,
 416               inst->mlen,
 417               inst->header_present,
 418               simd_mode,
 419               return_format);
 420 }
 421
 422
 423 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 424  * looking like:
 425  *
 426  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 427  *
 428  * and we're trying to produce:
 429  *
 430  *           DDX                     DDY
 431  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 432  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 433  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 434  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 435  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 436  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 437  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 438  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 439  *
 440  * and add another set of two more subspans if in 16-pixel dispatch mode.
 441  *
 442  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 443  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 444  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 445  * between each other.  We could probably do it like ddx and swizzle the right
 446  * order later, but bail for now and just produce
 447  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 448  */
 449 void
 450 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 451 {
 452    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 453                                  BRW_REGISTER_TYPE_F,
 454                                  BRW_VERTICAL_STRIDE_2,
 455                                  BRW_WIDTH_2,
 456                                  BRW_HORIZONTAL_STRIDE_0,
 457                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 458    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 459                                  BRW_REGISTER_TYPE_F,
 460                                  BRW_VERTICAL_STRIDE_2,
 461                                  BRW_WIDTH_2,
 462                                  BRW_HORIZONTAL_STRIDE_0,
 463                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 464    brw_ADD(p, dst, src0, negate(src1));
 465 }
 466
 467 /* The negate_value boolean is used to negate the derivative computation for
 468  * FBOs, since they place the origin at the upper left instead of the lower
 469  * left.
 470  */
 471 void
 472 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
 473                          bool negate_value)
 474 {
 475    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
 476                                  BRW_REGISTER_TYPE_F,
 477                                  BRW_VERTICAL_STRIDE_4,
 478                                  BRW_WIDTH_4,
 479                                  BRW_HORIZONTAL_STRIDE_0,
 480                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 481    struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
 482                                  BRW_REGISTER_TYPE_F,
 483                                  BRW_VERTICAL_STRIDE_4,
 484                                  BRW_WIDTH_4,
 485                                  BRW_HORIZONTAL_STRIDE_0,
 486                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 487    if (negate_value)
 488       brw_ADD(p, dst, src1, negate(src0));
 489    else
 490       brw_ADD(p, dst, src0, negate(src1));
 491 }
 492
 493 void
 494 fs_visitor::generate_discard(fs_inst *inst)
 495 {
 496    struct brw_reg f0 = brw_flag_reg();
 497
 498    if (intel->gen >= 6) {
 499       struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 500       struct brw_reg some_register;
 501
 502       /* As of gen6, we no longer have the mask register to look at,
 503        * so life gets a bit more complicated.
 504        */
 505
 506       /* Load the flag register with all ones. */
 507       brw_push_insn_state(p);
 508       brw_set_mask_control(p, BRW_MASK_DISABLE);
 509       brw_MOV(p, f0, brw_imm_uw(0xffff));
 510       brw_pop_insn_state(p);
 511
 512       /* Do a comparison that should always fail, to produce 0s in the flag
 513        * reg where we have active channels.
 514        */
 515       some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
 516       brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
 517               BRW_CONDITIONAL_NZ, some_register, some_register);
 518
 519       /* Undo CMP's whacking of predication*/
 520       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 521
 522       brw_push_insn_state(p);
 523       brw_set_mask_control(p, BRW_MASK_DISABLE);
 524       brw_AND(p, g1, f0, g1);
 525       brw_pop_insn_state(p);
 526    } else {
 527       struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 528
 529       brw_push_insn_state(p);
 530       brw_set_mask_control(p, BRW_MASK_DISABLE);
 531       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 532
 533       /* Unlike the 965, we have the mask reg, so we just need
 534        * somewhere to invert that (containing channels to be disabled)
 535        * so it can be ANDed with the mask of pixels still to be
 536        * written. Use the flag reg for consistency with gen6+.
 537        */
 538       brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
 539       brw_AND(p, g0, f0, g0);
 540
 541       brw_pop_insn_state(p);
 542    }
 543 }
 544
 545 void
 546 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
 547 {
 548    assert(inst->mlen != 0);
 549
 550    brw_MOV(p,
 551            retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
 552            retype(src, BRW_REGISTER_TYPE_UD));
 553    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
 554                                  inst->offset);
 555 }
 556
 557 void
 558 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
 559 {
 560    assert(inst->mlen != 0);
 561
 562    /* Clear any post destination dependencies that would be ignored by
 563     * the block read.  See the B-Spec for pre-gen5 send instruction.
 564     *
 565     * This could use a better solution, since texture sampling and
 566     * math reads could potentially run into it as well -- anywhere
 567     * that we have a SEND with a destination that is a register that
 568     * was written but not read within the last N instructions (what's
 569     * N?  unsure).  This is rare because of dead code elimination, but
 570     * not impossible.
 571     */
 572    if (intel->gen == 4 && !intel->is_g4x)
 573       brw_MOV(p, brw_null_reg(), dst);
 574
 575    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
 576                                 inst->offset);
 577
 578    if (intel->gen == 4 && !intel->is_g4x) {
 579       /* gen4 errata: destination from a send can't be used as a
 580        * destination until it's been read.  Just read it so we don't
 581        * have to worry.
 582        */
 583       brw_MOV(p, brw_null_reg(), dst);
 584    }
 585 }
 586
 587 void
 588 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst,
 589                                         struct brw_reg index,
 590                                         struct brw_reg offset)
 591 {
 592    assert(inst->mlen != 0);
 593
 594    /* Clear any post destination dependencies that would be ignored by
 595     * the block read.  See the B-Spec for pre-gen5 send instruction.
 596     *
 597     * This could use a better solution, since texture sampling and
 598     * math reads could potentially run into it as well -- anywhere
 599     * that we have a SEND with a destination that is a register that
 600     * was written but not read within the last N instructions (what's
 601     * N?  unsure).  This is rare because of dead code elimination, but
 602     * not impossible.
 603     */
 604    if (intel->gen == 4 && !intel->is_g4x)
 605       brw_MOV(p, brw_null_reg(), dst);
 606
 607    assert(index.file == BRW_IMMEDIATE_VALUE &&
 608           index.type == BRW_REGISTER_TYPE_UD);
 609    uint32_t surf_index = index.dw1.ud;
 610
 611    assert(offset.file == BRW_IMMEDIATE_VALUE &&
 612           offset.type == BRW_REGISTER_TYPE_UD);
 613    uint32_t read_offset = offset.dw1.ud;
 614
 615    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
 616                         read_offset, surf_index);
 617
 618    if (intel->gen == 4 && !intel->is_g4x) {
 619       /* gen4 errata: destination from a send can't be used as a
 620        * destination until it's been read.  Just read it so we don't
 621        * have to worry.
 622        */
 623       brw_MOV(p, brw_null_reg(), dst);
 624    }
 625 }
 626
 627
 628 /**
 629  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 630  * into the flags register (f0.0).
 631  *
 632  * Used only on Gen6 and above.
 633  */
 634 void
 635 fs_visitor::generate_mov_dispatch_to_flags()
 636 {
 637    struct brw_reg f0 = brw_flag_reg();
 638    struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
 639
 640    assert (intel->gen >= 6);
 641    brw_push_insn_state(p);
 642    brw_set_mask_control(p, BRW_MASK_DISABLE);
 643    brw_MOV(p, f0, g1);
 644    brw_pop_insn_state(p);
 645 }
 646
 647
 648 static uint32_t brw_file_from_reg(fs_reg *reg)
 649 {
 650    switch (reg->file) {
 651    case ARF:
 652       return BRW_ARCHITECTURE_REGISTER_FILE;
 653    case GRF:
 654       return BRW_GENERAL_REGISTER_FILE;
 655    case MRF:
 656       return BRW_MESSAGE_REGISTER_FILE;
 657    case IMM:
 658       return BRW_IMMEDIATE_VALUE;
 659    default:
 660       assert(!"not reached");
 661       return BRW_GENERAL_REGISTER_FILE;
 662    }
 663 }
 664
 665 static struct brw_reg
 666 brw_reg_from_fs_reg(fs_reg *reg)
 667 {
 668    struct brw_reg brw_reg;
 669
 670    switch (reg->file) {
 671    case GRF:
 672    case ARF:
 673    case MRF:
 674       if (reg->smear == -1) {
 675          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
 676       } else {
 677          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
 678       }
 679       brw_reg = retype(brw_reg, reg->type);
 680       if (reg->sechalf)
 681          brw_reg = sechalf(brw_reg);
 682       break;
 683    case IMM:
 684       switch (reg->type) {
 685       case BRW_REGISTER_TYPE_F:
 686          brw_reg = brw_imm_f(reg->imm.f);
 687          break;
 688       case BRW_REGISTER_TYPE_D:
 689          brw_reg = brw_imm_d(reg->imm.i);
 690          break;
 691       case BRW_REGISTER_TYPE_UD:
 692          brw_reg = brw_imm_ud(reg->imm.u);
 693          break;
 694       default:
 695          assert(!"not reached");
 696          brw_reg = brw_null_reg();
 697          break;
 698       }
 699       break;
 700    case FIXED_HW_REG:
 701       brw_reg = reg->fixed_hw_reg;
 702       break;
 703    case BAD_FILE:
 704       /* Probably unused. */
 705       brw_reg = brw_null_reg();
 706       break;
 707    case UNIFORM:
 708       assert(!"not reached");
 709       brw_reg = brw_null_reg();
 710       break;
 711    default:
 712       assert(!"not reached");
 713       brw_reg = brw_null_reg();
 714       break;
 715    }
 716    if (reg->abs)
 717       brw_reg = brw_abs(brw_reg);
 718    if (reg->negate)
 719       brw_reg = negate(brw_reg);
 720
 721    return brw_reg;
 722 }
 723
 724 void
 725 fs_visitor::generate_code()
 726 {
 727    int last_native_insn_offset = p->next_insn_offset;
 728    const char *last_annotation_string = NULL;
 729    const void *last_annotation_ir = NULL;
 730
 731    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 732       if (shader) {
 733          printf("Native code for fragment shader %d (%d-wide dispatch):\n",
 734                 prog->Name, dispatch_width);
 735       } else {
 736          printf("Native code for fragment program %d (%d-wide dispatch):\n",
 737                 fp->Base.Id, dispatch_width);
 738       }
 739    }
 740
 741    cfg_t *cfg = NULL;
 742    if (unlikely(INTEL_DEBUG & DEBUG_WM))
 743       cfg = new(mem_ctx) cfg_t(this);
 744
 745    foreach_list(node, &this->instructions) {
 746       fs_inst *inst = (fs_inst *)node;
 747       struct brw_reg src[3], dst;
 748
 749       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 750          foreach_list(node, &cfg->block_list) {
 751             bblock_link *link = (bblock_link *)node;
 752             bblock_t *block = link->block;
 753
 754             if (block->start == inst) {
 755                printf("   START B%d", block->block_num);
 756                foreach_list(predecessor_node, &block->parents) {
 757                   bblock_link *predecessor_link =
 758                      (bblock_link *)predecessor_node;
 759                   bblock_t *predecessor_block = predecessor_link->block;
 760                   printf(" <-B%d", predecessor_block->block_num);
 761                }
 762                printf("\n");
 763             }
 764          }
 765
 766          if (last_annotation_ir != inst->ir) {
 767             last_annotation_ir = inst->ir;
 768             if (last_annotation_ir) {
 769                printf("   ");
 770                if (shader)
 771                   ((ir_instruction *)inst->ir)->print();
 772                else {
 773                   const prog_instruction *fpi;
 774                   fpi = (const prog_instruction *)inst->ir;
 775                   printf("%d: ", (int)(fpi - fp->Base.Instructions));
 776                   _mesa_fprint_instruction_opt(stdout,
 777                                                fpi,
 778                                                0, PROG_PRINT_DEBUG, NULL);
 779                }
 780                printf("\n");
 781             }
 782          }
 783          if (last_annotation_string != inst->annotation) {
 784             last_annotation_string = inst->annotation;
 785             if (last_annotation_string)
 786                printf("   %s\n", last_annotation_string);
 787          }
 788       }
 789
 790       for (unsigned int i = 0; i < 3; i++) {
 791          src[i] = brw_reg_from_fs_reg(&inst->src[i]);
 792
 793          /* The accumulator result appears to get used for the
 794           * conditional modifier generation.  When negating a UD
 795           * value, there is a 33rd bit generated for the sign in the
 796           * accumulator value, so now you can't check, for example,
 797           * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
 798           */
 799          assert(!inst->conditional_mod ||
 800                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
 801                 !inst->src[i].negate);
 802       }
 803       dst = brw_reg_from_fs_reg(&inst->dst);
 804
 805       brw_set_conditionalmod(p, inst->conditional_mod);
 806       brw_set_predicate_control(p, inst->predicate);
 807       brw_set_predicate_inverse(p, inst->predicate_inverse);
 808       brw_set_saturate(p, inst->saturate);
 809
 810       if (inst->force_uncompressed || dispatch_width == 8) {
 811          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 812       } else if (inst->force_sechalf) {
 813          brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 814       } else {
 815          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 816       }
 817
 818       switch (inst->opcode) {
 819       case BRW_OPCODE_MOV:
 820          brw_MOV(p, dst, src[0]);
 821          break;
 822       case BRW_OPCODE_ADD:
 823          brw_ADD(p, dst, src[0], src[1]);
 824          break;
 825       case BRW_OPCODE_MUL:
 826          brw_MUL(p, dst, src[0], src[1]);
 827          break;
 828       case BRW_OPCODE_MACH:
 829          brw_set_acc_write_control(p, 1);
 830          brw_MACH(p, dst, src[0], src[1]);
 831          brw_set_acc_write_control(p, 0);
 832          break;
 833
 834       case BRW_OPCODE_MAD:
 835          brw_set_access_mode(p, BRW_ALIGN_16);
 836          if (dispatch_width == 16) {
 837             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 838             brw_MAD(p, dst, src[0], src[1], src[2]);
 839             brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
 840             brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
 841             brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
 842          } else {
 843             brw_MAD(p, dst, src[0], src[1], src[2]);
 844          }
 845          brw_set_access_mode(p, BRW_ALIGN_1);
 846          break;
 847
 848       case BRW_OPCODE_FRC:
 849          brw_FRC(p, dst, src[0]);
 850          break;
 851       case BRW_OPCODE_RNDD:
 852          brw_RNDD(p, dst, src[0]);
 853          break;
 854       case BRW_OPCODE_RNDE:
 855          brw_RNDE(p, dst, src[0]);
 856          break;
 857       case BRW_OPCODE_RNDZ:
 858          brw_RNDZ(p, dst, src[0]);
 859          break;
 860
 861       case BRW_OPCODE_AND:
 862          brw_AND(p, dst, src[0], src[1]);
 863          break;
 864       case BRW_OPCODE_OR:
 865          brw_OR(p, dst, src[0], src[1]);
 866          break;
 867       case BRW_OPCODE_XOR:
 868          brw_XOR(p, dst, src[0], src[1]);
 869          break;
 870       case BRW_OPCODE_NOT:
 871          brw_NOT(p, dst, src[0]);
 872          break;
 873       case BRW_OPCODE_ASR:
 874          brw_ASR(p, dst, src[0], src[1]);
 875          break;
 876       case BRW_OPCODE_SHR:
 877          brw_SHR(p, dst, src[0], src[1]);
 878          break;
 879       case BRW_OPCODE_SHL:
 880          brw_SHL(p, dst, src[0], src[1]);
 881          break;
 882
 883       case BRW_OPCODE_CMP:
 884          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
 885          break;
 886       case BRW_OPCODE_SEL:
 887          brw_SEL(p, dst, src[0], src[1]);
 888          break;
 889
 890       case BRW_OPCODE_IF:
 891          if (inst->src[0].file != BAD_FILE) {
 892             /* The instruction has an embedded compare (only allowed on gen6) */
 893             assert(intel->gen == 6);
 894             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
 895          } else {
 896             brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
 897          }
 898          break;
 899
 900       case BRW_OPCODE_ELSE:
 901          brw_ELSE(p);
 902          break;
 903       case BRW_OPCODE_ENDIF:
 904          brw_ENDIF(p);
 905          break;
 906
 907       case BRW_OPCODE_DO:
 908          brw_DO(p, BRW_EXECUTE_8);
 909          break;
 910
 911       case BRW_OPCODE_BREAK:
 912          brw_BREAK(p);
 913          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 914          break;
 915       case BRW_OPCODE_CONTINUE:
 916          /* FINISHME: We need to write the loop instruction support still. */
 917          if (intel->gen >= 6)
 918             gen6_CONT(p);
 919          else
 920             brw_CONT(p);
 921          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 922          break;
 923
 924       case BRW_OPCODE_WHILE:
 925          brw_WHILE(p);
 926          break;
 927
 928       case SHADER_OPCODE_RCP:
 929       case SHADER_OPCODE_RSQ:
 930       case SHADER_OPCODE_SQRT:
 931       case SHADER_OPCODE_EXP2:
 932       case SHADER_OPCODE_LOG2:
 933       case SHADER_OPCODE_SIN:
 934       case SHADER_OPCODE_COS:
 935          if (intel->gen >= 7) {
 936             generate_math1_gen7(inst, dst, src[0]);
 937          } else if (intel->gen == 6) {
 938             generate_math1_gen6(inst, dst, src[0]);
 939          } else {
 940             generate_math_gen4(inst, dst, src[0]);
 941          }
 942          break;
 943       case SHADER_OPCODE_INT_QUOTIENT:
 944       case SHADER_OPCODE_INT_REMAINDER:
 945       case SHADER_OPCODE_POW:
 946          if (intel->gen >= 7) {
 947             generate_math2_gen7(inst, dst, src[0], src[1]);
 948          } else if (intel->gen == 6) {
 949             generate_math2_gen6(inst, dst, src[0], src[1]);
 950          } else {
 951             generate_math_gen4(inst, dst, src[0]);
 952          }
 953          break;
 954       case FS_OPCODE_PIXEL_X:
 955          generate_pixel_xy(dst, true);
 956          break;
 957       case FS_OPCODE_PIXEL_Y:
 958          generate_pixel_xy(dst, false);
 959          break;
 960       case FS_OPCODE_CINTERP:
 961          brw_MOV(p, dst, src[0]);
 962          break;
 963       case FS_OPCODE_LINTERP:
 964          generate_linterp(inst, dst, src);
 965          break;
 966       case SHADER_OPCODE_TEX:
 967       case FS_OPCODE_TXB:
 968       case SHADER_OPCODE_TXD:
 969       case SHADER_OPCODE_TXF:
 970       case SHADER_OPCODE_TXL:
 971       case SHADER_OPCODE_TXS:
 972          generate_tex(inst, dst, src[0]);
 973          break;
 974       case FS_OPCODE_DISCARD:
 975          generate_discard(inst);
 976          break;
 977       case FS_OPCODE_DDX:
 978          generate_ddx(inst, dst, src[0]);
 979          break;
 980       case FS_OPCODE_DDY:
 981          /* Make sure fp->UsesDFdy flag got set (otherwise there's no
 982           * guarantee that c->key.render_to_fbo is set).
 983           */
 984          assert(fp->UsesDFdy);
 985          generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
 986          break;
 987
 988       case FS_OPCODE_SPILL:
 989          generate_spill(inst, src[0]);
 990          break;
 991
 992       case FS_OPCODE_UNSPILL:
 993          generate_unspill(inst, dst);
 994          break;
 995
 996       case FS_OPCODE_PULL_CONSTANT_LOAD:
 997          generate_pull_constant_load(inst, dst, src[0], src[1]);
 998          break;
 999
1000       case FS_OPCODE_FB_WRITE:
1001          generate_fb_write(inst);
1002          break;
1003
1004       case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1005          generate_mov_dispatch_to_flags();
1006          break;
1007
1008       default:
1009          if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1010             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1011                           opcode_descs[inst->opcode].name);
1012          } else {
1013             _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1014          }
1015          fail("unsupported opcode in FS\n");
1016       }
1017
1018       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1019          brw_dump_compile(p, stdout,
1020                           last_native_insn_offset, p->next_insn_offset);
1021
1022          foreach_list(node, &cfg->block_list) {
1023             bblock_link *link = (bblock_link *)node;
1024             bblock_t *block = link->block;
1025
1026             if (block->end == inst) {
1027                printf("   END B%d", block->block_num);
1028                foreach_list(successor_node, &block->children) {
1029                   bblock_link *successor_link =
1030                      (bblock_link *)successor_node;
1031                   bblock_t *successor_block = successor_link->block;
1032                   printf(" ->B%d", successor_block->block_num);
1033                }
1034                printf("\n");
1035             }
1036          }
1037       }
1038
1039       last_native_insn_offset = p->next_insn_offset;
1040    }
1041
1042    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1043       printf("\n");
1044    }
1045
1046    brw_set_uip_jip(p);
1047
1048    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1049     * emit issues, it doesn't get the jump distances into the output,
1050     * which is often something we want to debug.  So this is here in
1051     * case you're doing that.
1052     */
1053    if (0) {
1054       brw_dump_compile(p, stdout, 0, p->next_insn_offset);
1055    }
1056 }