src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "program/prog_parameter.h"
  35 #include "program/prog_print.h"
  36 #include "program/prog_optimize.h"
  37 #include "util/register_allocate.h"
  38 #include "program/hash_table.h"
  39 #include "brw_context.h"
  40 #include "brw_eu.h"
  41 #include "brw_wm.h"
  42 #include "brw_cs.h"
  43 #include "brw_vec4.h"
  44 #include "brw_fs.h"
  45 #include "main/uniforms.h"
  46 #include "glsl/glsl_types.h"
  47 #include "glsl/ir_optimization.h"
  48 #include "program/sampler.h"
  49
  50 using namespace brw;
  51
  52 fs_reg *
  53 fs_visitor::emit_vs_system_value(int location)
  54 {
  55    fs_reg *reg = new(this->mem_ctx)
  56       fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
  57    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  58
  59    switch (location) {
  60    case SYSTEM_VALUE_BASE_VERTEX:
  61       reg->reg_offset = 0;
  62       vs_prog_data->uses_vertexid = true;
  63       break;
  64    case SYSTEM_VALUE_VERTEX_ID:
  65    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
  66       reg->reg_offset = 2;
  67       vs_prog_data->uses_vertexid = true;
  68       break;
  69    case SYSTEM_VALUE_INSTANCE_ID:
  70       reg->reg_offset = 3;
  71       vs_prog_data->uses_instanceid = true;
  72       break;
  73    default:
  74       unreachable("not reached");
  75    }
  76
  77    return reg;
  78 }
  79
  80 fs_inst *
  81 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
  82                      const fs_reg &a)
  83 {
  84    if (devinfo->gen < 6) {
  85       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
  86       fs_reg y_times_a           = vgrf(glsl_type::float_type);
  87       fs_reg one_minus_a         = vgrf(glsl_type::float_type);
  88       fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
  89
  90       emit(MUL(y_times_a, y, a));
  91
  92       fs_reg negative_a = a;
  93       negative_a.negate = !a.negate;
  94       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
  95       emit(MUL(x_times_one_minus_a, x, one_minus_a));
  96
  97       return emit(ADD(dst, x_times_one_minus_a, y_times_a));
  98    } else {
  99       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 100        * we need to reorder the operands.
 101        */
 102       return emit(LRP(dst, a, y, x));
 103    }
 104 }
 105
 106 void
 107 fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
 108 {
 109    const fs_reg chan_index = vgrf(glsl_type::uint_type);
 110
 111    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
 112       ->force_writemask_all = true;
 113    emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
 114         src, component(chan_index, 0))
 115       ->force_writemask_all = true;
 116 }
 117
 118 fs_inst *
 119 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 120                               fs_reg coordinate, int coord_components,
 121                               fs_reg shadow_c,
 122                               fs_reg lod, fs_reg dPdy, int grad_components,
 123                               uint32_t sampler)
 124 {
 125    int mlen;
 126    int base_mrf = 1;
 127    bool simd16 = false;
 128    fs_reg orig_dst;
 129
 130    /* g0 header. */
 131    mlen = 1;
 132
 133    if (shadow_c.file != BAD_FILE) {
 134       for (int i = 0; i < coord_components; i++) {
 135          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 136          coordinate = offset(coordinate, 1);
 137       }
 138
 139       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
 140        * the unused slots must be zeroed.
 141        */
 142       for (int i = coord_components; i < 3; i++) {
 143          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
 144       }
 145       mlen += 3;
 146
 147       if (op == ir_tex) {
 148          /* There's no plain shadow compare message, so we use shadow
 149           * compare with a bias of 0.0.
 150           */
 151          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
 152          mlen++;
 153       } else if (op == ir_txb || op == ir_txl) {
 154          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
 155          mlen++;
 156       } else {
 157          unreachable("Should not get here.");
 158       }
 159
 160       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
 161       mlen++;
 162    } else if (op == ir_tex) {
 163       for (int i = 0; i < coord_components; i++) {
 164          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 165          coordinate = offset(coordinate, 1);
 166       }
 167       /* zero the others. */
 168       for (int i = coord_components; i<3; i++) {
 169          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
 170       }
 171       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
 172       mlen += 3;
 173    } else if (op == ir_txd) {
 174       fs_reg &dPdx = lod;
 175
 176       for (int i = 0; i < coord_components; i++) {
 177          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
 178          coordinate = offset(coordinate, 1);
 179       }
 180       /* the slots for u and v are always present, but r is optional */
 181       mlen += MAX2(coord_components, 2);
 182
 183       /*  P   = u, v, r
 184        * dPdx = dudx, dvdx, drdx
 185        * dPdy = dudy, dvdy, drdy
 186        *
 187        * 1-arg: Does not exist.
 188        *
 189        * 2-arg: dudx   dvdx   dudy   dvdy
 190        *        dPdx.x dPdx.y dPdy.x dPdy.y
 191        *        m4     m5     m6     m7
 192        *
 193        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
 194        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
 195        *        m5     m6     m7     m8     m9     m10
 196        */
 197       for (int i = 0; i < grad_components; i++) {
 198          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
 199          dPdx = offset(dPdx, 1);
 200       }
 201       mlen += MAX2(grad_components, 2);
 202
 203       for (int i = 0; i < grad_components; i++) {
 204          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
 205          dPdy = offset(dPdy, 1);
 206       }
 207       mlen += MAX2(grad_components, 2);
 208    } else if (op == ir_txs) {
 209       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
 210       simd16 = true;
 211       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
 212       mlen += 2;
 213    } else {
 214       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
 215        * instructions.  We'll need to do SIMD16 here.
 216        */
 217       simd16 = true;
 218       assert(op == ir_txb || op == ir_txl || op == ir_txf);
 219
 220       for (int i = 0; i < coord_components; i++) {
 221          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
 222                   coordinate));
 223          coordinate = offset(coordinate, 1);
 224       }
 225
 226       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
 227        * be necessary for TXF (ld), but seems wise to do for all messages.
 228        */
 229       for (int i = coord_components; i < 3; i++) {
 230          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
 231       }
 232
 233       /* lod/bias appears after u/v/r. */
 234       mlen += 6;
 235
 236       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
 237       mlen++;
 238
 239       /* The unused upper half. */
 240       mlen++;
 241    }
 242
 243    if (simd16) {
 244       /* Now, since we're doing simd16, the return is 2 interleaved
 245        * vec4s where the odd-indexed ones are junk. We'll need to move
 246        * this weirdness around to the expected layout.
 247        */
 248       orig_dst = dst;
 249       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
 250    }
 251
 252    enum opcode opcode;
 253    switch (op) {
 254    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 255    case ir_txb: opcode = FS_OPCODE_TXB; break;
 256    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 257    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 258    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 259    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 260    default:
 261       unreachable("not reached");
 262    }
 263
 264    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
 265    inst->base_mrf = base_mrf;
 266    inst->mlen = mlen;
 267    inst->header_size = 1;
 268    inst->regs_written = simd16 ? 8 : 4;
 269
 270    if (simd16) {
 271       for (int i = 0; i < 4; i++) {
 272          emit(MOV(orig_dst, dst));
 273          orig_dst = offset(orig_dst, 1);
 274          dst = offset(dst, 2);
 275       }
 276    }
 277
 278    return inst;
 279 }
 280
 281 fs_inst *
 282 fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
 283                                      fs_reg coordinate, int vector_elements,
 284                                      fs_reg shadow_c, fs_reg lod,
 285                                      uint32_t sampler)
 286 {
 287    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
 288    bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
 289
 290    if (has_lod && shadow_c.file != BAD_FILE)
 291       no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
 292
 293    if (op == ir_txd)
 294       no16("textureGrad unsupported in SIMD16.");
 295
 296    /* Copy the coordinates. */
 297    for (int i = 0; i < vector_elements; i++) {
 298       emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
 299       coordinate = offset(coordinate, 1);
 300    }
 301
 302    fs_reg msg_end = offset(message, vector_elements);
 303
 304    /* Messages other than sample and ld require all three components */
 305    if (has_lod || shadow_c.file != BAD_FILE) {
 306       for (int i = vector_elements; i < 3; i++) {
 307          emit(MOV(offset(message, i), fs_reg(0.0f)));
 308       }
 309    }
 310
 311    if (has_lod) {
 312       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
 313                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
 314       emit(MOV(msg_lod, lod));
 315       msg_end = offset(msg_lod, 1);
 316    }
 317
 318    if (shadow_c.file != BAD_FILE) {
 319       fs_reg msg_ref = offset(message, 3 + has_lod);
 320       emit(MOV(msg_ref, shadow_c));
 321       msg_end = offset(msg_ref, 1);
 322    }
 323
 324    enum opcode opcode;
 325    switch (op) {
 326    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 327    case ir_txb: opcode = FS_OPCODE_TXB;     break;
 328    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 329    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 330    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 331    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 332    default: unreachable("not reached");
 333    }
 334
 335    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
 336    inst->base_mrf = message.reg - 1;
 337    inst->mlen = msg_end.reg - inst->base_mrf;
 338    inst->header_size = 1;
 339    inst->regs_written = 8;
 340
 341    return inst;
 342 }
 343
 344 /* gen5's sampler has slots for u, v, r, array index, then optional
 345  * parameters like shadow comparitor or LOD bias.  If optional
 346  * parameters aren't present, those base slots are optional and don't
 347  * need to be included in the message.
 348  *
 349  * We don't fill in the unnecessary slots regardless, which may look
 350  * surprising in the disassembly.
 351  */
 352 fs_inst *
 353 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
 354                               fs_reg coordinate, int vector_elements,
 355                               fs_reg shadow_c,
 356                               fs_reg lod, fs_reg lod2, int grad_components,
 357                               fs_reg sample_index, uint32_t sampler,
 358                               bool has_offset)
 359 {
 360    int reg_width = dispatch_width / 8;
 361    unsigned header_size = 0;
 362
 363    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
 364    fs_reg msg_coords = message;
 365
 366    if (has_offset) {
 367       /* The offsets set up by the ir_texture visitor are in the
 368        * m1 header, so we can't go headerless.
 369        */
 370       header_size = 1;
 371       message.reg--;
 372    }
 373
 374    for (int i = 0; i < vector_elements; i++) {
 375       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
 376       coordinate = offset(coordinate, 1);
 377    }
 378    fs_reg msg_end = offset(msg_coords, vector_elements);
 379    fs_reg msg_lod = offset(msg_coords, 4);
 380
 381    if (shadow_c.file != BAD_FILE) {
 382       fs_reg msg_shadow = msg_lod;
 383       emit(MOV(msg_shadow, shadow_c));
 384       msg_lod = offset(msg_shadow, 1);
 385       msg_end = msg_lod;
 386    }
 387
 388    enum opcode opcode;
 389    switch (op) {
 390    case ir_tex:
 391       opcode = SHADER_OPCODE_TEX;
 392       break;
 393    case ir_txb:
 394       emit(MOV(msg_lod, lod));
 395       msg_end = offset(msg_lod, 1);
 396
 397       opcode = FS_OPCODE_TXB;
 398       break;
 399    case ir_txl:
 400       emit(MOV(msg_lod, lod));
 401       msg_end = offset(msg_lod, 1);
 402
 403       opcode = SHADER_OPCODE_TXL;
 404       break;
 405    case ir_txd: {
 406       /**
 407        *  P   =  u,    v,    r
 408        * dPdx = dudx, dvdx, drdx
 409        * dPdy = dudy, dvdy, drdy
 410        *
 411        * Load up these values:
 412        * - dudx   dudy   dvdx   dvdy   drdx   drdy
 413        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
 414        */
 415       msg_end = msg_lod;
 416       for (int i = 0; i < grad_components; i++) {
 417          emit(MOV(msg_end, lod));
 418          lod = offset(lod, 1);
 419          msg_end = offset(msg_end, 1);
 420
 421          emit(MOV(msg_end, lod2));
 422          lod2 = offset(lod2, 1);
 423          msg_end = offset(msg_end, 1);
 424       }
 425
 426       opcode = SHADER_OPCODE_TXD;
 427       break;
 428    }
 429    case ir_txs:
 430       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
 431       emit(MOV(msg_lod, lod));
 432       msg_end = offset(msg_lod, 1);
 433
 434       opcode = SHADER_OPCODE_TXS;
 435       break;
 436    case ir_query_levels:
 437       msg_lod = msg_end;
 438       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
 439       msg_end = offset(msg_lod, 1);
 440
 441       opcode = SHADER_OPCODE_TXS;
 442       break;
 443    case ir_txf:
 444       msg_lod = offset(msg_coords, 3);
 445       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
 446       msg_end = offset(msg_lod, 1);
 447
 448       opcode = SHADER_OPCODE_TXF;
 449       break;
 450    case ir_txf_ms:
 451       msg_lod = offset(msg_coords, 3);
 452       /* lod */
 453       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
 454       /* sample index */
 455       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
 456       msg_end = offset(msg_lod, 2);
 457
 458       opcode = SHADER_OPCODE_TXF_CMS;
 459       break;
 460    case ir_lod:
 461       opcode = SHADER_OPCODE_LOD;
 462       break;
 463    case ir_tg4:
 464       opcode = SHADER_OPCODE_TG4;
 465       break;
 466    default:
 467       unreachable("not reached");
 468    }
 469
 470    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
 471    inst->base_mrf = message.reg;
 472    inst->mlen = msg_end.reg - message.reg;
 473    inst->header_size = header_size;
 474    inst->regs_written = 4 * reg_width;
 475
 476    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
 477       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
 478            " disallowed by hardware\n");
 479    }
 480
 481    return inst;
 482 }
 483
 484 static bool
 485 is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
 486 {
 487    if (devinfo->gen < 8 && !devinfo->is_haswell)
 488       return false;
 489
 490    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 491 }
 492
 493 fs_inst *
 494 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 495                               fs_reg coordinate, int coord_components,
 496                               fs_reg shadow_c,
 497                               fs_reg lod, fs_reg lod2, int grad_components,
 498                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
 499                               fs_reg offset_value)
 500 {
 501    int reg_width = dispatch_width / 8;
 502    unsigned header_size = 0;
 503
 504    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
 505    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
 506       sources[i] = vgrf(glsl_type::float_type);
 507    }
 508    int length = 0;
 509
 510    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
 511        is_high_sampler(devinfo, sampler)) {
 512       /* For general texture offsets (no txf workaround), we need a header to
 513        * put them in.  Note that for SIMD16 we're making space for two actual
 514        * hardware registers here, so the emit will have to fix up for this.
 515        *
 516        * * ir4_tg4 needs to place its channel select in the header,
 517        * for interaction with ARB_texture_swizzle
 518        *
 519        * The sampler index is only 4-bits, so for larger sampler numbers we
 520        * need to offset the Sampler State Pointer in the header.
 521        */
 522       header_size = 1;
 523       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 524       length++;
 525    }
 526
 527    if (shadow_c.file != BAD_FILE) {
 528       emit(MOV(sources[length], shadow_c));
 529       length++;
 530    }
 531
 532    bool has_nonconstant_offset =
 533       offset_value.file != BAD_FILE && offset_value.file != IMM;
 534    bool coordinate_done = false;
 535
 536    /* The sampler can only meaningfully compute LOD for fragment shader
 537     * messages. For all other stages, we change the opcode to ir_txl and
 538     * hardcode the LOD to 0.
 539     */
 540    if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
 541       op = ir_txl;
 542       lod = fs_reg(0.0f);
 543    }
 544
 545    /* Set up the LOD info */
 546    switch (op) {
 547    case ir_tex:
 548    case ir_lod:
 549       break;
 550    case ir_txb:
 551       emit(MOV(sources[length], lod));
 552       length++;
 553       break;
 554    case ir_txl:
 555       emit(MOV(sources[length], lod));
 556       length++;
 557       break;
 558    case ir_txd: {
 559       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
 560
 561       /* Load dPdx and the coordinate together:
 562        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
 563        */
 564       for (int i = 0; i < coord_components; i++) {
 565          emit(MOV(sources[length], coordinate));
 566          coordinate = offset(coordinate, 1);
 567          length++;
 568
 569          /* For cube map array, the coordinate is (u,v,r,ai) but there are
 570           * only derivatives for (u, v, r).
 571           */
 572          if (i < grad_components) {
 573             emit(MOV(sources[length], lod));
 574             lod = offset(lod, 1);
 575             length++;
 576
 577             emit(MOV(sources[length], lod2));
 578             lod2 = offset(lod2, 1);
 579             length++;
 580          }
 581       }
 582
 583       coordinate_done = true;
 584       break;
 585    }
 586    case ir_txs:
 587       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
 588       length++;
 589       break;
 590    case ir_query_levels:
 591       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
 592       length++;
 593       break;
 594    case ir_txf:
 595       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
 596        * On Gen9 they are u, v, lod, r
 597        */
 598
 599       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
 600       coordinate = offset(coordinate, 1);
 601       length++;
 602
 603       if (devinfo->gen >= 9) {
 604          if (coord_components >= 2) {
 605             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
 606             coordinate = offset(coordinate, 1);
 607          }
 608          length++;
 609       }
 610
 611       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
 612       length++;
 613
 614       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
 615          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
 616          coordinate = offset(coordinate, 1);
 617          length++;
 618       }
 619
 620       coordinate_done = true;
 621       break;
 622    case ir_txf_ms:
 623       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
 624       length++;
 625
 626       /* data from the multisample control surface */
 627       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
 628       length++;
 629
 630       /* there is no offsetting for this message; just copy in the integer
 631        * texture coordinates
 632        */
 633       for (int i = 0; i < coord_components; i++) {
 634          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
 635          coordinate = offset(coordinate, 1);
 636          length++;
 637       }
 638
 639       coordinate_done = true;
 640       break;
 641    case ir_tg4:
 642       if (has_nonconstant_offset) {
 643          if (shadow_c.file != BAD_FILE)
 644             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
 645
 646          /* More crazy intermixing */
 647          for (int i = 0; i < 2; i++) { /* u, v */
 648             emit(MOV(sources[length], coordinate));
 649             coordinate = offset(coordinate, 1);
 650             length++;
 651          }
 652
 653          for (int i = 0; i < 2; i++) { /* offu, offv */
 654             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
 655             offset_value = offset(offset_value, 1);
 656             length++;
 657          }
 658
 659          if (coord_components == 3) { /* r if present */
 660             emit(MOV(sources[length], coordinate));
 661             coordinate = offset(coordinate, 1);
 662             length++;
 663          }
 664
 665          coordinate_done = true;
 666       }
 667       break;
 668    }
 669
 670    /* Set up the coordinate (except for cases where it was done above) */
 671    if (!coordinate_done) {
 672       for (int i = 0; i < coord_components; i++) {
 673          emit(MOV(sources[length], coordinate));
 674          coordinate = offset(coordinate, 1);
 675          length++;
 676       }
 677    }
 678
 679    int mlen;
 680    if (reg_width == 2)
 681       mlen = length * reg_width - header_size;
 682    else
 683       mlen = length * reg_width;
 684
 685    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
 686                                BRW_REGISTER_TYPE_F, dispatch_width);
 687    emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
 688
 689    /* Generate the SEND */
 690    enum opcode opcode;
 691    switch (op) {
 692    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 693    case ir_txb: opcode = FS_OPCODE_TXB; break;
 694    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 695    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 696    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 697    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
 698    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 699    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 700    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
 701    case ir_tg4:
 702       if (has_nonconstant_offset)
 703          opcode = SHADER_OPCODE_TG4_OFFSET;
 704       else
 705          opcode = SHADER_OPCODE_TG4;
 706       break;
 707    default:
 708       unreachable("not reached");
 709    }
 710    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
 711    inst->base_mrf = -1;
 712    inst->mlen = mlen;
 713    inst->header_size = header_size;
 714    inst->regs_written = 4 * reg_width;
 715
 716    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
 717       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
 718            " disallowed by hardware\n");
 719    }
 720
 721    return inst;
 722 }
 723
 724 fs_reg
 725 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 726                              bool is_rect, uint32_t sampler, int texunit)
 727 {
 728    fs_inst *inst = NULL;
 729    bool needs_gl_clamp = true;
 730    fs_reg scale_x, scale_y;
 731
 732    /* The 965 requires the EU to do the normalization of GL rectangle
 733     * texture coordinates.  We use the program parameter state
 734     * tracking to get the scaling factor.
 735     */
 736    if (is_rect &&
 737        (devinfo->gen < 6 ||
 738         (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
 739                                key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
 740       struct gl_program_parameter_list *params = prog->Parameters;
 741       int tokens[STATE_LENGTH] = {
 742          STATE_INTERNAL,
 743          STATE_TEXRECT_SCALE,
 744          texunit,
 745          0,
 746          0
 747       };
 748
 749       no16("rectangle scale uniform setup not supported on SIMD16\n");
 750       if (dispatch_width == 16) {
 751          return coordinate;
 752       }
 753
 754       GLuint index = _mesa_add_state_reference(params,
 755                                                (gl_state_index *)tokens);
 756       /* Try to find existing copies of the texrect scale uniforms. */
 757       for (unsigned i = 0; i < uniforms; i++) {
 758          if (stage_prog_data->param[i] ==
 759              &prog->Parameters->ParameterValues[index][0]) {
 760             scale_x = fs_reg(UNIFORM, i);
 761             scale_y = fs_reg(UNIFORM, i + 1);
 762             break;
 763          }
 764       }
 765
 766       /* If we didn't already set them up, do so now. */
 767       if (scale_x.file == BAD_FILE) {
 768          scale_x = fs_reg(UNIFORM, uniforms);
 769          scale_y = fs_reg(UNIFORM, uniforms + 1);
 770
 771          stage_prog_data->param[uniforms++] =
 772             &prog->Parameters->ParameterValues[index][0];
 773          stage_prog_data->param[uniforms++] =
 774             &prog->Parameters->ParameterValues[index][1];
 775       }
 776    }
 777
 778    /* The 965 requires the EU to do the normalization of GL rectangle
 779     * texture coordinates.  We use the program parameter state
 780     * tracking to get the scaling factor.
 781     */
 782    if (devinfo->gen < 6 && is_rect) {
 783       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
 784       fs_reg src = coordinate;
 785       coordinate = dst;
 786
 787       emit(MUL(dst, src, scale_x));
 788       dst = offset(dst, 1);
 789       src = offset(src, 1);
 790       emit(MUL(dst, src, scale_y));
 791    } else if (is_rect) {
 792       /* On gen6+, the sampler handles the rectangle coordinates
 793        * natively, without needing rescaling.  But that means we have
 794        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
 795        * not [0, 1] like the default case below.
 796        */
 797       needs_gl_clamp = false;
 798
 799       for (int i = 0; i < 2; i++) {
 800          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 801             fs_reg chan = coordinate;
 802             chan = offset(chan, i);
 803
 804             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
 805             inst->conditional_mod = BRW_CONDITIONAL_GE;
 806
 807             /* Our parameter comes in as 1.0/width or 1.0/height,
 808              * because that's what people normally want for doing
 809              * texture rectangle handling.  We need width or height
 810              * for clamping, but we don't care enough to make a new
 811              * parameter type, so just invert back.
 812              */
 813             fs_reg limit = vgrf(glsl_type::float_type);
 814             emit(MOV(limit, i == 0 ? scale_x : scale_y));
 815             emit(SHADER_OPCODE_RCP, limit, limit);
 816
 817             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
 818             inst->conditional_mod = BRW_CONDITIONAL_L;
 819          }
 820       }
 821    }
 822
 823    if (coord_components > 0 && needs_gl_clamp) {
 824       for (int i = 0; i < MIN2(coord_components, 3); i++) {
 825          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 826             fs_reg chan = coordinate;
 827             chan = offset(chan, i);
 828
 829             fs_inst *inst = emit(MOV(chan, chan));
 830             inst->saturate = true;
 831          }
 832       }
 833    }
 834    return coordinate;
 835 }
 836
 837 /* Sample from the MCS surface attached to this multisample texture. */
 838 fs_reg
 839 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
 840 {
 841    int reg_width = dispatch_width / 8;
 842    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
 843                            BRW_REGISTER_TYPE_F, dispatch_width);
 844    fs_reg dest = vgrf(glsl_type::uvec4_type);
 845    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
 846
 847    /* parameters are: u, v, r; missing parameters are treated as zero */
 848    for (int i = 0; i < components; i++) {
 849       sources[i] = vgrf(glsl_type::float_type);
 850       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
 851       coordinate = offset(coordinate, 1);
 852    }
 853
 854    emit(LOAD_PAYLOAD(payload, sources, components, 0));
 855
 856    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
 857    inst->base_mrf = -1;
 858    inst->mlen = components * reg_width;
 859    inst->header_size = 0;
 860    inst->regs_written = 4 * reg_width; /* we only care about one reg of
 861                                         * response, but the sampler always
 862                                         * writes 4/8
 863                                         */
 864
 865    return dest;
 866 }
 867
 868 void
 869 fs_visitor::emit_texture(ir_texture_opcode op,
 870                          const glsl_type *dest_type,
 871                          fs_reg coordinate, int coord_components,
 872                          fs_reg shadow_c,
 873                          fs_reg lod, fs_reg lod2, int grad_components,
 874                          fs_reg sample_index,
 875                          fs_reg offset_value,
 876                          fs_reg mcs,
 877                          int gather_component,
 878                          bool is_cube_array,
 879                          bool is_rect,
 880                          uint32_t sampler,
 881                          fs_reg sampler_reg, int texunit)
 882 {
 883    fs_inst *inst = NULL;
 884
 885    if (op == ir_tg4) {
 886       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
 887        * emitting anything other than setting up the constant result.
 888        */
 889       int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
 890       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
 891
 892          fs_reg res = vgrf(glsl_type::vec4_type);
 893          this->result = res;
 894
 895          for (int i=0; i<4; i++) {
 896             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
 897             res = offset(res, 1);
 898          }
 899          return;
 900       }
 901    }
 902
 903    if (coordinate.file != BAD_FILE) {
 904       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
 905        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
 906        */
 907       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
 908                                     sampler, texunit);
 909    }
 910
 911    /* Writemasking doesn't eliminate channels on SIMD8 texture
 912     * samples, so don't worry about them.
 913     */
 914    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
 915
 916    if (devinfo->gen >= 7) {
 917       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
 918                                shadow_c, lod, lod2, grad_components,
 919                                sample_index, mcs, sampler_reg,
 920                                offset_value);
 921    } else if (devinfo->gen >= 5) {
 922       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
 923                                shadow_c, lod, lod2, grad_components,
 924                                sample_index, sampler,
 925                                offset_value.file != BAD_FILE);
 926    } else if (dispatch_width == 16) {
 927       inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
 928                                       shadow_c, lod, sampler);
 929    } else {
 930       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
 931                                shadow_c, lod, lod2, grad_components,
 932                                sampler);
 933    }
 934
 935    if (shadow_c.file != BAD_FILE)
 936       inst->shadow_compare = true;
 937
 938    if (offset_value.file == IMM)
 939       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
 940
 941    if (op == ir_tg4) {
 942       inst->offset |=
 943          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
 944
 945       if (devinfo->gen == 6)
 946          emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
 947    }
 948
 949    /* fixup #layers for cube map arrays */
 950    if (op == ir_txs && is_cube_array) {
 951       fs_reg depth = offset(dst, 2);
 952       fs_reg fixed_depth = vgrf(glsl_type::int_type);
 953       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 954
 955       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
 956       int components = inst->regs_written / (dst.width / 8);
 957       for (int i = 0; i < components; i++) {
 958          if (i == 2) {
 959             fixed_payload[i] = fixed_depth;
 960          } else {
 961             fixed_payload[i] = offset(dst, i);
 962          }
 963       }
 964       emit(LOAD_PAYLOAD(dst, fixed_payload, components, 0));
 965    }
 966
 967    swizzle_result(op, dest_type->vector_elements, dst, sampler);
 968 }
 969
 970 /**
 971  * Apply workarounds for Gen6 gather with UINT/SINT
 972  */
 973 void
 974 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
 975 {
 976    if (!wa)
 977       return;
 978
 979    int width = (wa & WA_8BIT) ? 8 : 16;
 980
 981    for (int i = 0; i < 4; i++) {
 982       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
 983       /* Convert from UNORM to UINT */
 984       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
 985       emit(MOV(dst, dst_f));
 986
 987       if (wa & WA_SIGN) {
 988          /* Reinterpret the UINT value as a signed INT value by
 989           * shifting the sign bit into place, then shifting back
 990           * preserving sign.
 991           */
 992          emit(SHL(dst, dst, fs_reg(32 - width)));
 993          emit(ASR(dst, dst, fs_reg(32 - width)));
 994       }
 995
 996       dst = offset(dst, 1);
 997    }
 998 }
 999
1000 /**
1001  * Set up the gather channel based on the swizzle, for gather4.
1002  */
1003 uint32_t
1004 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
1005 {
1006    int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
1007    switch (swiz) {
1008       case SWIZZLE_X: return 0;
1009       case SWIZZLE_Y:
1010          /* gather4 sampler is broken for green channel on RG32F --
1011           * we must ask for blue instead.
1012           */
1013          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1014             return 2;
1015          return 1;
1016       case SWIZZLE_Z: return 2;
1017       case SWIZZLE_W: return 3;
1018       default:
1019          unreachable("Not reached"); /* zero, one swizzles handled already */
1020    }
1021 }
1022
1023 /**
1024  * Swizzle the result of a texture result.  This is necessary for
1025  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1026  */
1027 void
1028 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
1029                            fs_reg orig_val, uint32_t sampler)
1030 {
1031    if (op == ir_query_levels) {
1032       /* # levels is in .w */
1033       this->result = offset(orig_val, 3);
1034       return;
1035    }
1036
1037    this->result = orig_val;
1038
1039    /* txs,lod don't actually sample the texture, so swizzling the result
1040     * makes no sense.
1041     */
1042    if (op == ir_txs || op == ir_lod || op == ir_tg4)
1043       return;
1044
1045    if (dest_components == 1) {
1046       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1047    } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
1048       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
1049       swizzled_result.type = orig_val.type;
1050
1051       for (int i = 0; i < 4; i++) {
1052          int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
1053          fs_reg l = swizzled_result;
1054          l = offset(l, i);
1055
1056          if (swiz == SWIZZLE_ZERO) {
1057             emit(MOV(l, fs_reg(0.0f)));
1058          } else if (swiz == SWIZZLE_ONE) {
1059             emit(MOV(l, fs_reg(1.0f)));
1060          } else {
1061             emit(MOV(l, offset(orig_val,
1062                                GET_SWZ(key_tex->swizzles[sampler], i))));
1063          }
1064       }
1065       this->result = swizzled_result;
1066    }
1067 }
1068
1069 /**
1070  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
1071  *
1072  * Many GLSL shaders contain the following pattern:
1073  *
1074  *    x = condition ? foo : bar
1075  *
1076  * The compiler emits an ir_if tree for this, since each subexpression might be
1077  * a complex tree that could have side-effects or short-circuit logic.
1078  *
1079  * However, the common case is to simply select one of two constants or
1080  * variable values---which is exactly what SEL is for.  In this case, the
1081  * assembly looks like:
1082  *
1083  *    (+f0) IF
1084  *    MOV dst src0
1085  *    ELSE
1086  *    MOV dst src1
1087  *    ENDIF
1088  *
1089  * which can be easily translated into:
1090  *
1091  *    (+f0) SEL dst src0 src1
1092  *
1093  * If src0 is an immediate value, we promote it to a temporary GRF.
1094  */
1095 bool
1096 fs_visitor::try_replace_with_sel()
1097 {
1098    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
1099    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
1100
1101    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
1102    int opcodes[] = {
1103       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
1104    };
1105
1106    fs_inst *match = (fs_inst *) endif_inst->prev;
1107    for (int i = 0; i < 4; i++) {
1108       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
1109          return false;
1110       match = (fs_inst *) match->prev;
1111    }
1112
1113    /* The opcodes match; it looks like the right sequence of instructions. */
1114    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
1115    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
1116    fs_inst *if_inst = (fs_inst *) then_mov->prev;
1117
1118    /* Check that the MOVs are the right form. */
1119    if (then_mov->dst.equals(else_mov->dst) &&
1120        !then_mov->is_partial_write() &&
1121        !else_mov->is_partial_write()) {
1122
1123       /* Remove the matched instructions; we'll emit a SEL to replace them. */
1124       while (!if_inst->next->is_tail_sentinel())
1125          if_inst->next->exec_node::remove();
1126       if_inst->exec_node::remove();
1127
1128       /* Only the last source register can be a constant, so if the MOV in
1129        * the "then" clause uses a constant, we need to put it in a temporary.
1130        */
1131       fs_reg src0(then_mov->src[0]);
1132       if (src0.file == IMM) {
1133          src0 = vgrf(glsl_type::float_type);
1134          src0.type = then_mov->src[0].type;
1135          bld.MOV(src0, then_mov->src[0]);
1136       }
1137
1138       if (if_inst->conditional_mod) {
1139          /* Sandybridge-specific IF with embedded comparison */
1140          bld.CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
1141                  if_inst->conditional_mod);
1142          set_predicate(BRW_PREDICATE_NORMAL,
1143                        bld.emit(BRW_OPCODE_SEL, then_mov->dst,
1144                                 src0, else_mov->src[0]));
1145       } else {
1146          /* Separate CMP and IF instructions */
1147          set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
1148                            bld.emit(BRW_OPCODE_SEL, then_mov->dst,
1149                                     src0, else_mov->src[0]));
1150       }
1151
1152       return true;
1153    }
1154
1155    return false;
1156 }
1157
1158 void
1159 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1160                                 fs_reg dst, fs_reg offset, fs_reg src0,
1161                                 fs_reg src1)
1162 {
1163    int reg_width = dispatch_width / 8;
1164    int length = 0;
1165
1166    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
1167
1168    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1169    /* Initialize the sample mask in the message header. */
1170    emit(MOV(sources[0], fs_reg(0u)))
1171       ->force_writemask_all = true;
1172
1173    if (stage == MESA_SHADER_FRAGMENT) {
1174       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
1175          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
1176             ->force_writemask_all = true;
1177       } else {
1178          emit(MOV(component(sources[0], 7),
1179                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
1180             ->force_writemask_all = true;
1181       }
1182    } else {
1183       /* The execution mask is part of the side-band information sent together with
1184        * the message payload to the data port. It's implicitly ANDed with the sample
1185        * mask sent in the header to compute the actual set of channels that execute
1186        * the atomic operation.
1187        */
1188       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
1189       emit(MOV(component(sources[0], 7),
1190                fs_reg(0xffffu)))->force_writemask_all = true;
1191    }
1192    length++;
1193
1194    /* Set the atomic operation offset. */
1195    sources[1] = vgrf(glsl_type::uint_type);
1196    emit(MOV(sources[1], offset));
1197    length++;
1198
1199    /* Set the atomic operation arguments. */
1200    if (src0.file != BAD_FILE) {
1201       sources[length] = vgrf(glsl_type::uint_type);
1202       emit(MOV(sources[length], src0));
1203       length++;
1204    }
1205
1206    if (src1.file != BAD_FILE) {
1207       sources[length] = vgrf(glsl_type::uint_type);
1208       emit(MOV(sources[length], src1));
1209       length++;
1210    }
1211
1212    int mlen = 1 + (length - 1) * reg_width;
1213    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1214                                BRW_REGISTER_TYPE_UD, dispatch_width);
1215    emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
1216
1217    /* Emit the instruction. */
1218    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
1219                         fs_reg(surf_index), fs_reg(atomic_op));
1220    inst->mlen = mlen;
1221 }
1222
1223 void
1224 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
1225                                       fs_reg offset)
1226 {
1227    int reg_width = dispatch_width / 8;
1228
1229    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1230
1231    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1232    /* Initialize the sample mask in the message header. */
1233    emit(MOV(sources[0], fs_reg(0u)))
1234       ->force_writemask_all = true;
1235
1236    if (stage == MESA_SHADER_FRAGMENT) {
1237       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
1238          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
1239             ->force_writemask_all = true;
1240       } else {
1241          emit(MOV(component(sources[0], 7),
1242                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
1243             ->force_writemask_all = true;
1244       }
1245    } else {
1246       /* The execution mask is part of the side-band information sent together with
1247        * the message payload to the data port. It's implicitly ANDed with the sample
1248        * mask sent in the header to compute the actual set of channels that execute
1249        * the atomic operation.
1250        */
1251       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
1252       emit(MOV(component(sources[0], 7),
1253                fs_reg(0xffffu)))->force_writemask_all = true;
1254    }
1255
1256    /* Set the surface read offset. */
1257    sources[1] = vgrf(glsl_type::uint_type);
1258    emit(MOV(sources[1], offset));
1259
1260    int mlen = 1 + reg_width;
1261    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1262                                BRW_REGISTER_TYPE_UD, dispatch_width);
1263    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
1264
1265    /* Emit the instruction. */
1266    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
1267                fs_reg(surf_index), fs_reg(1));
1268    inst->mlen = mlen;
1269 }
1270
1271 fs_inst *
1272 fs_visitor::emit(fs_inst *inst)
1273 {
1274    if (dispatch_width == 16 && inst->exec_size == 8)
1275       inst->force_uncompressed = true;
1276
1277    inst->annotation = this->current_annotation;
1278    inst->ir = this->base_ir;
1279
1280    this->instructions.push_tail(inst);
1281
1282    return inst;
1283 }
1284
1285 void
1286 fs_visitor::emit(exec_list list)
1287 {
1288    foreach_in_list_safe(fs_inst, inst, &list) {
1289       inst->exec_node::remove();
1290       emit(inst);
1291    }
1292 }
1293
1294 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1295 void
1296 fs_visitor::emit_dummy_fs()
1297 {
1298    int reg_width = dispatch_width / 8;
1299
1300    /* Everyone's favorite color. */
1301    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
1302    for (int i = 0; i < 4; i++) {
1303       emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
1304                       dispatch_width), fs_reg(color[i])));
1305    }
1306
1307    fs_inst *write;
1308    write = emit(FS_OPCODE_FB_WRITE);
1309    write->eot = true;
1310    if (devinfo->gen >= 6) {
1311       write->base_mrf = 2;
1312       write->mlen = 4 * reg_width;
1313    } else {
1314       write->header_size = 2;
1315       write->base_mrf = 0;
1316       write->mlen = 2 + 4 * reg_width;
1317    }
1318
1319    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
1320     * varying to avoid GPU hangs, so set that.
1321     */
1322    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
1323    wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
1324    memset(wm_prog_data->urb_setup, -1,
1325           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1326
1327    /* We don't have any uniforms. */
1328    stage_prog_data->nr_params = 0;
1329    stage_prog_data->nr_pull_params = 0;
1330    stage_prog_data->curb_read_length = 0;
1331    stage_prog_data->dispatch_grf_start_reg = 2;
1332    wm_prog_data->dispatch_grf_start_reg_16 = 2;
1333    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
1334
1335    calculate_cfg();
1336 }
1337
1338 /* The register location here is relative to the start of the URB
1339  * data.  It will get adjusted to be a real location before
1340  * generate_code() time.
1341  */
1342 struct brw_reg
1343 fs_visitor::interp_reg(int location, int channel)
1344 {
1345    assert(stage == MESA_SHADER_FRAGMENT);
1346    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1347    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
1348    int stride = (channel & 1) * 4;
1349
1350    assert(prog_data->urb_setup[location] != -1);
1351
1352    return brw_vec1_grf(regnr, stride);
1353 }
1354
1355 /** Emits the interpolation for the varying inputs. */
1356 void
1357 fs_visitor::emit_interpolation_setup_gen4()
1358 {
1359    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1360
1361    this->current_annotation = "compute pixel centers";
1362    this->pixel_x = vgrf(glsl_type::uint_type);
1363    this->pixel_y = vgrf(glsl_type::uint_type);
1364    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1365    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1366    emit(ADD(this->pixel_x,
1367             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1368             fs_reg(brw_imm_v(0x10101010))));
1369    emit(ADD(this->pixel_y,
1370             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1371             fs_reg(brw_imm_v(0x11001100))));
1372
1373    this->current_annotation = "compute pixel deltas from v0";
1374
1375    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1376       vgrf(glsl_type::vec2_type);
1377    const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1378    const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
1379    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
1380
1381    if (devinfo->has_pln && dispatch_width == 16) {
1382       emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
1383       emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
1384       emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
1385          ->force_sechalf = true;
1386       emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
1387          ->force_sechalf = true;
1388    } else {
1389       emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
1390       emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
1391    }
1392
1393    this->current_annotation = "compute pos.w and 1/pos.w";
1394    /* Compute wpos.w.  It's always in our setup, since it's needed to
1395     * interpolate the other attributes.
1396     */
1397    this->wpos_w = vgrf(glsl_type::float_type);
1398    emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
1399    /* Compute the pixel 1/W value from wpos.w. */
1400    this->pixel_w = vgrf(glsl_type::float_type);
1401    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1402    this->current_annotation = NULL;
1403 }
1404
1405 /** Emits the interpolation for the varying inputs. */
1406 void
1407 fs_visitor::emit_interpolation_setup_gen6()
1408 {
1409    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1410
1411    this->current_annotation = "compute pixel centers";
1412    if (brw->gen >= 8 || dispatch_width == 8) {
1413       /* The "Register Region Restrictions" page says for BDW (and newer,
1414        * presumably):
1415        *
1416        *     "When destination spans two registers, the source may be one or
1417        *      two registers. The destination elements must be evenly split
1418        *      between the two registers."
1419        *
1420        * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
1421        * compute our pixel centers.
1422        */
1423       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
1424                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
1425       emit(ADD(int_pixel_xy,
1426                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
1427                fs_reg(brw_imm_v(0x11001010))))
1428          ->force_writemask_all = true;
1429
1430       this->pixel_x = vgrf(glsl_type::float_type);
1431       this->pixel_y = vgrf(glsl_type::float_type);
1432       emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
1433       emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
1434    } else {
1435       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
1436        *
1437        *     "When destination spans two registers, the source MUST span two
1438        *      registers."
1439        *
1440        * Since the GRF source of the ADD will only read a single register, we
1441        * must do two separate ADDs in SIMD16.
1442        */
1443       fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
1444       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
1445       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1446       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1447       emit(ADD(int_pixel_x,
1448                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1449                fs_reg(brw_imm_v(0x10101010))));
1450       emit(ADD(int_pixel_y,
1451                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1452                fs_reg(brw_imm_v(0x11001100))));
1453
1454       /* As of gen6, we can no longer mix float and int sources.  We have
1455        * to turn the integer pixel centers into floats for their actual
1456        * use.
1457        */
1458       this->pixel_x = vgrf(glsl_type::float_type);
1459       this->pixel_y = vgrf(glsl_type::float_type);
1460       emit(MOV(this->pixel_x, int_pixel_x));
1461       emit(MOV(this->pixel_y, int_pixel_y));
1462    }
1463
1464    this->current_annotation = "compute pos.w";
1465    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
1466    this->wpos_w = vgrf(glsl_type::float_type);
1467    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1468
1469    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1470       uint8_t reg = payload.barycentric_coord_reg[i];
1471       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
1472    }
1473
1474    this->current_annotation = NULL;
1475 }
1476
1477 void
1478 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
1479                                 unsigned exec_size, bool use_2nd_half)
1480 {
1481    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1482    fs_inst *inst;
1483
1484    if (key->clamp_fragment_color) {
1485       fs_reg tmp = vgrf(glsl_type::vec4_type);
1486       assert(color.type == BRW_REGISTER_TYPE_F);
1487       for (unsigned i = 0; i < components; i++) {
1488          inst = emit(MOV(offset(tmp, i), offset(color, i)));
1489          inst->saturate = true;
1490       }
1491       color = tmp;
1492    }
1493
1494    if (exec_size < dispatch_width) {
1495       unsigned half_idx = use_2nd_half ? 1 : 0;
1496       for (unsigned i = 0; i < components; i++)
1497          dst[i] = half(offset(color, i), half_idx);
1498    } else {
1499       for (unsigned i = 0; i < components; i++)
1500          dst[i] = offset(color, i);
1501    }
1502 }
1503
1504 static enum brw_conditional_mod
1505 cond_for_alpha_func(GLenum func)
1506 {
1507    switch(func) {
1508       case GL_GREATER:
1509          return BRW_CONDITIONAL_G;
1510       case GL_GEQUAL:
1511          return BRW_CONDITIONAL_GE;
1512       case GL_LESS:
1513          return BRW_CONDITIONAL_L;
1514       case GL_LEQUAL:
1515          return BRW_CONDITIONAL_LE;
1516       case GL_EQUAL:
1517          return BRW_CONDITIONAL_EQ;
1518       case GL_NOTEQUAL:
1519          return BRW_CONDITIONAL_NEQ;
1520       default:
1521          unreachable("Not reached");
1522    }
1523 }
1524
1525 /**
1526  * Alpha test support for when we compile it into the shader instead
1527  * of using the normal fixed-function alpha test.
1528  */
1529 void
1530 fs_visitor::emit_alpha_test()
1531 {
1532    assert(stage == MESA_SHADER_FRAGMENT);
1533    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534    this->current_annotation = "Alpha test";
1535
1536    fs_inst *cmp;
1537    if (key->alpha_test_func == GL_ALWAYS)
1538       return;
1539
1540    if (key->alpha_test_func == GL_NEVER) {
1541       /* f0.1 = 0 */
1542       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1543                                       BRW_REGISTER_TYPE_UW));
1544       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
1545                      BRW_CONDITIONAL_NEQ));
1546    } else {
1547       /* RT0 alpha */
1548       fs_reg color = offset(outputs[0], 3);
1549
1550       /* f0.1 &= func(color, ref) */
1551       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
1552                      cond_for_alpha_func(key->alpha_test_func)));
1553    }
1554    cmp->predicate = BRW_PREDICATE_NORMAL;
1555    cmp->flag_subreg = 1;
1556 }
1557
1558 fs_inst *
1559 fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
1560                                  fs_reg src0_alpha, unsigned components,
1561                                  unsigned exec_size, bool use_2nd_half)
1562 {
1563    assert(stage == MESA_SHADER_FRAGMENT);
1564    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1565    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1566
1567    this->current_annotation = "FB write header";
1568    int header_size = 2, payload_header_size;
1569
1570    /* We can potentially have a message length of up to 15, so we have to set
1571     * base_mrf to either 0 or 1 in order to fit in m0..m15.
1572     */
1573    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
1574    int length = 0;
1575
1576    /* From the Sandy Bridge PRM, volume 4, page 198:
1577     *
1578     *     "Dispatched Pixel Enables. One bit per pixel indicating
1579     *      which pixels were originally enabled when the thread was
1580     *      dispatched. This field is only required for the end-of-
1581     *      thread message and on all dual-source messages."
1582     */
1583    if (devinfo->gen >= 6 &&
1584        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
1585        color1.file == BAD_FILE &&
1586        key->nr_color_regions == 1) {
1587       header_size = 0;
1588    }
1589
1590    if (header_size != 0) {
1591       assert(header_size == 2);
1592       /* Allocate 2 registers for a header */
1593       length += 2;
1594    }
1595
1596    if (payload.aa_dest_stencil_reg) {
1597       sources[length] = fs_reg(GRF, alloc.allocate(1));
1598       emit(MOV(sources[length],
1599                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
1600       length++;
1601    }
1602
1603    prog_data->uses_omask =
1604       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
1605    if (prog_data->uses_omask) {
1606       this->current_annotation = "FB write oMask";
1607       assert(this->sample_mask.file != BAD_FILE);
1608       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
1609        * it's unsinged single words, one vgrf is always 16-wide.
1610        */
1611       sources[length] = fs_reg(GRF, alloc.allocate(1),
1612                                BRW_REGISTER_TYPE_UW, 16);
1613       emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
1614       length++;
1615    }
1616
1617    payload_header_size = length;
1618
1619    if (color0.file == BAD_FILE) {
1620       /* Even if there's no color buffers enabled, we still need to send
1621        * alpha out the pipeline to our null renderbuffer to support
1622        * alpha-testing, alpha-to-coverage, and so on.
1623        */
1624       if (this->outputs[0].file != BAD_FILE)
1625          setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
1626                              1, exec_size, false);
1627       length += 4;
1628    } else if (color1.file == BAD_FILE) {
1629       if (src0_alpha.file != BAD_FILE) {
1630          setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
1631          length++;
1632       }
1633
1634       setup_color_payload(&sources[length], color0, components,
1635                           exec_size, use_2nd_half);
1636       length += 4;
1637    } else {
1638       setup_color_payload(&sources[length], color0, components,
1639                           exec_size, use_2nd_half);
1640       length += 4;
1641       setup_color_payload(&sources[length], color1, components,
1642                           exec_size, use_2nd_half);
1643       length += 4;
1644    }
1645
1646    if (source_depth_to_render_target) {
1647       if (devinfo->gen == 6) {
1648          /* For outputting oDepth on gen6, SIMD8 writes have to be
1649           * used.  This would require SIMD8 moves of each half to
1650           * message regs, kind of like pre-gen5 SIMD16 FB writes.
1651           * Just bail on doing so for now.
1652           */
1653          no16("Missing support for simd16 depth writes on gen6\n");
1654       }
1655
1656       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1657          /* Hand over gl_FragDepth. */
1658          assert(this->frag_depth.file != BAD_FILE);
1659          if (exec_size < dispatch_width) {
1660             sources[length] = half(this->frag_depth, use_2nd_half);
1661          } else {
1662             sources[length] = this->frag_depth;
1663          }
1664       } else {
1665          /* Pass through the payload depth. */
1666          sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
1667       }
1668       length++;
1669    }
1670
1671    if (payload.dest_depth_reg)
1672       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
1673
1674    fs_inst *load;
1675    fs_inst *write;
1676    if (devinfo->gen >= 7) {
1677       /* Send from the GRF */
1678       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
1679       load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
1680       payload.reg = alloc.allocate(load->regs_written);
1681       load->dst = payload;
1682       write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
1683       write->base_mrf = -1;
1684    } else {
1685       /* Send from the MRF */
1686       load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
1687                                sources, length, payload_header_size));
1688
1689       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
1690        * will do this for us if we just give it a COMPR4 destination.
1691        */
1692       if (brw->gen < 6 && exec_size == 16)
1693          load->dst.reg |= BRW_MRF_COMPR4;
1694
1695       write = emit(FS_OPCODE_FB_WRITE);
1696       write->exec_size = exec_size;
1697       write->base_mrf = 1;
1698    }
1699
1700    write->mlen = load->regs_written;
1701    write->header_size = header_size;
1702    if (prog_data->uses_kill) {
1703       write->predicate = BRW_PREDICATE_NORMAL;
1704       write->flag_subreg = 1;
1705    }
1706    return write;
1707 }
1708
1709 void
1710 fs_visitor::emit_fb_writes()
1711 {
1712    assert(stage == MESA_SHADER_FRAGMENT);
1713    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716    fs_inst *inst = NULL;
1717    if (do_dual_src) {
1718       this->current_annotation = ralloc_asprintf(this->mem_ctx,
1719                                                  "FB dual-source write");
1720       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
1721                                   reg_undef, 4, 8);
1722       inst->target = 0;
1723
1724       /* SIMD16 dual source blending requires to send two SIMD8 dual source
1725        * messages, where each message contains color data for 8 pixels. Color
1726        * data for the first group of pixels is stored in the "lower" half of
1727        * the color registers, so in SIMD16, the previous message did:
1728        * m + 0: r0
1729        * m + 1: g0
1730        * m + 2: b0
1731        * m + 3: a0
1732        *
1733        * Here goes the second message, which packs color data for the
1734        * remaining 8 pixels. Color data for these pixels is stored in the
1735        * "upper" half of the color registers, so we need to do:
1736        * m + 0: r1
1737        * m + 1: g1
1738        * m + 2: b1
1739        * m + 3: a1
1740        */
1741       if (dispatch_width == 16) {
1742          inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
1743                                      reg_undef, 4, 8, true);
1744          inst->target = 0;
1745       }
1746
1747       prog_data->dual_src_blend = true;
1748    } else {
1749       for (int target = 0; target < key->nr_color_regions; target++) {
1750          /* Skip over outputs that weren't written. */
1751          if (this->outputs[target].file == BAD_FILE)
1752             continue;
1753
1754          this->current_annotation = ralloc_asprintf(this->mem_ctx,
1755                                                     "FB write target %d",
1756                                                     target);
1757          fs_reg src0_alpha;
1758          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
1759             src0_alpha = offset(outputs[0], 3);
1760
1761          inst = emit_single_fb_write(this->outputs[target], reg_undef,
1762                                      src0_alpha,
1763                                      this->output_components[target],
1764                                      dispatch_width);
1765          inst->target = target;
1766       }
1767    }
1768
1769    if (inst == NULL) {
1770       /* Even if there's no color buffers enabled, we still need to send
1771        * alpha out the pipeline to our null renderbuffer to support
1772        * alpha-testing, alpha-to-coverage, and so on.
1773        */
1774       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0,
1775                                   dispatch_width);
1776       inst->target = 0;
1777    }
1778
1779    inst->eot = true;
1780    this->current_annotation = NULL;
1781 }
1782
1783 void
1784 fs_visitor::setup_uniform_clipplane_values()
1785 {
1786    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
1787    const struct brw_vue_prog_key *key =
1788       (const struct brw_vue_prog_key *) this->key;
1789
1790    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
1791       this->userplane[i] = fs_reg(UNIFORM, uniforms);
1792       for (int j = 0; j < 4; ++j) {
1793          stage_prog_data->param[uniforms + j] =
1794             (gl_constant_value *) &clip_planes[i][j];
1795       }
1796       uniforms += 4;
1797    }
1798 }
1799
1800 void fs_visitor::compute_clip_distance()
1801 {
1802    struct brw_vue_prog_data *vue_prog_data =
1803       (struct brw_vue_prog_data *) prog_data;
1804    const struct brw_vue_prog_key *key =
1805       (const struct brw_vue_prog_key *) this->key;
1806
1807    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1808     *
1809     *     "If a linked set of shaders forming the vertex stage contains no
1810     *     static write to gl_ClipVertex or gl_ClipDistance, but the
1811     *     application has requested clipping against user clip planes through
1812     *     the API, then the coordinate written to gl_Position is used for
1813     *     comparison against the user clip planes."
1814     *
1815     * This function is only called if the shader didn't write to
1816     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
1817     * if the user wrote to it; otherwise we use gl_Position.
1818     */
1819
1820    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
1821    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
1822       clip_vertex = VARYING_SLOT_POS;
1823
1824    /* If the clip vertex isn't written, skip this.  Typically this means
1825     * the GS will set up clipping. */
1826    if (outputs[clip_vertex].file == BAD_FILE)
1827       return;
1828
1829    setup_uniform_clipplane_values();
1830
1831    current_annotation = "user clip distances";
1832
1833    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
1834    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
1835
1836    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
1837       fs_reg u = userplane[i];
1838       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
1839       output.reg_offset = i & 3;
1840
1841       emit(MUL(output, outputs[clip_vertex], u));
1842       for (int j = 1; j < 4; j++) {
1843          u.reg = userplane[i].reg + j;
1844          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
1845       }
1846    }
1847 }
1848
1849 void
1850 fs_visitor::emit_urb_writes()
1851 {
1852    int slot, urb_offset, length;
1853    struct brw_vs_prog_data *vs_prog_data =
1854       (struct brw_vs_prog_data *) prog_data;
1855    const struct brw_vs_prog_key *key =
1856       (const struct brw_vs_prog_key *) this->key;
1857    const GLbitfield64 psiz_mask =
1858       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
1859    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
1860    bool flush;
1861    fs_reg sources[8];
1862
1863    /* Lower legacy ff and ClipVertex clipping to clip distances */
1864    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
1865       compute_clip_distance();
1866
1867    /* If we don't have any valid slots to write, just do a minimal urb write
1868     * send to terminate the shader. */
1869    if (vue_map->slots_valid == 0) {
1870
1871       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1872       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
1873                                                       BRW_REGISTER_TYPE_UD))));
1874       inst->force_writemask_all = true;
1875
1876       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1877       inst->eot = true;
1878       inst->mlen = 1;
1879       inst->offset = 1;
1880       return;
1881    }
1882
1883    length = 0;
1884    urb_offset = 0;
1885    flush = false;
1886    for (slot = 0; slot < vue_map->num_slots; slot++) {
1887       fs_reg reg, src, zero;
1888
1889       int varying = vue_map->slot_to_varying[slot];
1890       switch (varying) {
1891       case VARYING_SLOT_PSIZ:
1892
1893          /* The point size varying slot is the vue header and is always in the
1894           * vue map.  But often none of the special varyings that live there
1895           * are written and in that case we can skip writing to the vue
1896           * header, provided the corresponding state properly clamps the
1897           * values further down the pipeline. */
1898          if ((vue_map->slots_valid & psiz_mask) == 0) {
1899             assert(length == 0);
1900             urb_offset++;
1901             break;
1902          }
1903
1904          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1905          emit(MOV(zero, fs_reg(0u)));
1906
1907          sources[length++] = zero;
1908          if (vue_map->slots_valid & VARYING_BIT_LAYER)
1909             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
1910          else
1911             sources[length++] = zero;
1912
1913          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
1914             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
1915          else
1916             sources[length++] = zero;
1917
1918          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
1919             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
1920          else
1921             sources[length++] = zero;
1922          break;
1923
1924       case BRW_VARYING_SLOT_NDC:
1925       case VARYING_SLOT_EDGE:
1926          unreachable("unexpected scalar vs output");
1927          break;
1928
1929       case BRW_VARYING_SLOT_PAD:
1930          break;
1931
1932       default:
1933          /* gl_Position is always in the vue map, but isn't always written by
1934           * the shader.  Other varyings (clip distances) get added to the vue
1935           * map but don't always get written.  In those cases, the
1936           * corresponding this->output[] slot will be invalid we and can skip
1937           * the urb write for the varying.  If we've already queued up a vue
1938           * slot for writing we flush a mlen 5 urb write, otherwise we just
1939           * advance the urb_offset.
1940           */
1941          if (this->outputs[varying].file == BAD_FILE) {
1942             if (length > 0)
1943                flush = true;
1944             else
1945                urb_offset++;
1946             break;
1947          }
1948
1949          if ((varying == VARYING_SLOT_COL0 ||
1950               varying == VARYING_SLOT_COL1 ||
1951               varying == VARYING_SLOT_BFC0 ||
1952               varying == VARYING_SLOT_BFC1) &&
1953              key->clamp_vertex_color) {
1954             /* We need to clamp these guys, so do a saturating MOV into a
1955              * temp register and use that for the payload.
1956              */
1957             for (int i = 0; i < 4; i++) {
1958                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
1959                src = offset(this->outputs[varying], i);
1960                fs_inst *inst = emit(MOV(reg, src));
1961                inst->saturate = true;
1962                sources[length++] = reg;
1963             }
1964          } else {
1965             for (int i = 0; i < 4; i++)
1966                sources[length++] = offset(this->outputs[varying], i);
1967          }
1968          break;
1969       }
1970
1971       current_annotation = "URB write";
1972
1973       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
1974        * the last slot or if we need to flush (see BAD_FILE varying case
1975        * above), emit a URB write send now to flush out the data.
1976        */
1977       int last = slot == vue_map->num_slots - 1;
1978       if (length == 8 || last)
1979          flush = true;
1980       if (flush) {
1981          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
1982          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
1983                                  BRW_REGISTER_TYPE_F, dispatch_width);
1984          payload_sources[0] =
1985             fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1986
1987          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
1988          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
1989
1990          fs_inst *inst =
1991             emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1992          inst->eot = last;
1993          inst->mlen = length + 1;
1994          inst->offset = urb_offset;
1995          urb_offset = slot + 1;
1996          length = 0;
1997          flush = false;
1998       }
1999    }
2000 }
2001
2002 void
2003 fs_visitor::resolve_ud_negate(fs_reg *reg)
2004 {
2005    if (reg->type != BRW_REGISTER_TYPE_UD ||
2006        !reg->negate)
2007       return;
2008
2009    fs_reg temp = vgrf(glsl_type::uint_type);
2010    emit(MOV(temp, *reg));
2011    *reg = temp;
2012 }
2013
2014 void
2015 fs_visitor::emit_cs_terminate()
2016 {
2017    assert(brw->gen >= 7);
2018
2019    /* We are getting the thread ID from the compute shader header */
2020    assert(stage == MESA_SHADER_COMPUTE);
2021
2022    /* We can't directly send from g0, since sends with EOT have to use
2023     * g112-127. So, copy it to a virtual register, The register allocator will
2024     * make sure it uses the appropriate register range.
2025     */
2026    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
2027    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
2028    fs_inst *inst = emit(MOV(payload, g0));
2029    inst->force_writemask_all = true;
2030
2031    /* Send a message to the thread spawner to terminate the thread. */
2032    inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
2033    inst->eot = true;
2034 }
2035
2036 fs_visitor::fs_visitor(struct brw_context *brw,
2037                        void *mem_ctx,
2038                        gl_shader_stage stage,
2039                        const void *key,
2040                        struct brw_stage_prog_data *prog_data,
2041                        struct gl_shader_program *shader_prog,
2042                        struct gl_program *prog,
2043                        unsigned dispatch_width)
2044    : backend_shader(brw, shader_prog, prog, prog_data, stage),
2045      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
2046      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
2047      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
2048      key(key), prog_data(prog_data),
2049      dispatch_width(dispatch_width), promoted_constants(0),
2050      bld(fs_builder(this, dispatch_width).at_end())
2051 {
2052    this->mem_ctx = mem_ctx;
2053
2054    switch (stage) {
2055    case MESA_SHADER_FRAGMENT:
2056       key_tex = &((const brw_wm_prog_key *) key)->tex;
2057       break;
2058    case MESA_SHADER_VERTEX:
2059    case MESA_SHADER_GEOMETRY:
2060       key_tex = &((const brw_vue_prog_key *) key)->tex;
2061       break;
2062    case MESA_SHADER_COMPUTE:
2063       key_tex = &((const brw_cs_prog_key*) key)->tex;
2064       break;
2065    default:
2066       unreachable("unhandled shader stage");
2067    }
2068
2069    this->failed = false;
2070    this->simd16_unsupported = false;
2071    this->no16_msg = NULL;
2072
2073    this->nir_locals = NULL;
2074    this->nir_globals = NULL;
2075
2076    memset(&this->payload, 0, sizeof(this->payload));
2077    memset(this->outputs, 0, sizeof(this->outputs));
2078    memset(this->output_components, 0, sizeof(this->output_components));
2079    this->source_depth_to_render_target = false;
2080    this->runtime_check_aads_emit = false;
2081    this->first_non_payload_grf = 0;
2082    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2083
2084    this->current_annotation = NULL;
2085    this->base_ir = NULL;
2086
2087    this->virtual_grf_start = NULL;
2088    this->virtual_grf_end = NULL;
2089    this->live_intervals = NULL;
2090    this->regs_live_at_ip = NULL;
2091
2092    this->uniforms = 0;
2093    this->last_scratch = 0;
2094    this->pull_constant_loc = NULL;
2095    this->push_constant_loc = NULL;
2096
2097    this->spilled_any_registers = false;
2098    this->do_dual_src = false;
2099
2100    if (dispatch_width == 8)
2101       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
2102 }
2103
2104 fs_visitor::~fs_visitor()
2105 {
2106 }