src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "program/prog_parameter.h"
  35 #include "program/prog_print.h"
  36 #include "program/prog_optimize.h"
  37 #include "util/register_allocate.h"
  38 #include "program/hash_table.h"
  39 #include "brw_context.h"
  40 #include "brw_eu.h"
  41 #include "brw_wm.h"
  42 #include "brw_cs.h"
  43 #include "brw_vec4.h"
  44 #include "brw_fs.h"
  45 #include "main/uniforms.h"
  46 #include "glsl/glsl_types.h"
  47 #include "glsl/ir_optimization.h"
  48 #include "program/sampler.h"
  49
  50 using namespace brw;
  51
  52 fs_reg *
  53 fs_visitor::emit_vs_system_value(int location)
  54 {
  55    fs_reg *reg = new(this->mem_ctx)
  56       fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
  57    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  58
  59    switch (location) {
  60    case SYSTEM_VALUE_BASE_VERTEX:
  61       reg->reg_offset = 0;
  62       vs_prog_data->uses_vertexid = true;
  63       break;
  64    case SYSTEM_VALUE_VERTEX_ID:
  65    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
  66       reg->reg_offset = 2;
  67       vs_prog_data->uses_vertexid = true;
  68       break;
  69    case SYSTEM_VALUE_INSTANCE_ID:
  70       reg->reg_offset = 3;
  71       vs_prog_data->uses_instanceid = true;
  72       break;
  73    default:
  74       unreachable("not reached");
  75    }
  76
  77    return reg;
  78 }
  79
  80 fs_inst *
  81 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
  82                      const fs_reg &a)
  83 {
  84    if (devinfo->gen < 6) {
  85       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
  86       fs_reg y_times_a           = vgrf(glsl_type::float_type);
  87       fs_reg one_minus_a         = vgrf(glsl_type::float_type);
  88       fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
  89
  90       emit(MUL(y_times_a, y, a));
  91
  92       fs_reg negative_a = a;
  93       negative_a.negate = !a.negate;
  94       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
  95       emit(MUL(x_times_one_minus_a, x, one_minus_a));
  96
  97       return emit(ADD(dst, x_times_one_minus_a, y_times_a));
  98    } else {
  99       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 100        * we need to reorder the operands.
 101        */
 102       return emit(LRP(dst, a, y, x));
 103    }
 104 }
 105
 106 void
 107 fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
 108 {
 109    const fs_reg chan_index = vgrf(glsl_type::uint_type);
 110
 111    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
 112       ->force_writemask_all = true;
 113    emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
 114         src, component(chan_index, 0))
 115       ->force_writemask_all = true;
 116 }
 117
 118 fs_inst *
 119 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 120                               fs_reg coordinate, int coord_components,
 121                               fs_reg shadow_c,
 122                               fs_reg lod, fs_reg dPdy, int grad_components,
 123                               uint32_t sampler)
 124 {
 125    int mlen;
 126    int base_mrf = 1;
 127    bool simd16 = false;
 128    fs_reg orig_dst;
 129
 130    /* g0 header. */
 131    mlen = 1;
 132
 133    if (shadow_c.file != BAD_FILE) {
 134       for (int i = 0; i < coord_components; i++) {
 135          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 136          coordinate = offset(coordinate, 1);
 137       }
 138
 139       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
 140        * the unused slots must be zeroed.
 141        */
 142       for (int i = coord_components; i < 3; i++) {
 143          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
 144       }
 145       mlen += 3;
 146
 147       if (op == ir_tex) {
 148          /* There's no plain shadow compare message, so we use shadow
 149           * compare with a bias of 0.0.
 150           */
 151          bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
 152          mlen++;
 153       } else if (op == ir_txb || op == ir_txl) {
 154          bld.MOV(fs_reg(MRF, base_mrf + mlen), lod);
 155          mlen++;
 156       } else {
 157          unreachable("Should not get here.");
 158       }
 159
 160       bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c);
 161       mlen++;
 162    } else if (op == ir_tex) {
 163       for (int i = 0; i < coord_components; i++) {
 164          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 165          coordinate = offset(coordinate, 1);
 166       }
 167       /* zero the others. */
 168       for (int i = coord_components; i<3; i++) {
 169          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
 170       }
 171       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
 172       mlen += 3;
 173    } else if (op == ir_txd) {
 174       fs_reg &dPdx = lod;
 175
 176       for (int i = 0; i < coord_components; i++) {
 177          bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 178          coordinate = offset(coordinate, 1);
 179       }
 180       /* the slots for u and v are always present, but r is optional */
 181       mlen += MAX2(coord_components, 2);
 182
 183       /*  P   = u, v, r
 184        * dPdx = dudx, dvdx, drdx
 185        * dPdy = dudy, dvdy, drdy
 186        *
 187        * 1-arg: Does not exist.
 188        *
 189        * 2-arg: dudx   dvdx   dudy   dvdy
 190        *        dPdx.x dPdx.y dPdy.x dPdy.y
 191        *        m4     m5     m6     m7
 192        *
 193        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
 194        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
 195        *        m5     m6     m7     m8     m9     m10
 196        */
 197       for (int i = 0; i < grad_components; i++) {
 198          bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
 199          dPdx = offset(dPdx, 1);
 200       }
 201       mlen += MAX2(grad_components, 2);
 202
 203       for (int i = 0; i < grad_components; i++) {
 204          bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
 205          dPdy = offset(dPdy, 1);
 206       }
 207       mlen += MAX2(grad_components, 2);
 208    } else if (op == ir_txs) {
 209       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
 210       simd16 = true;
 211       bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
 212       mlen += 2;
 213    } else {
 214       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
 215        * instructions.  We'll need to do SIMD16 here.
 216        */
 217       simd16 = true;
 218       assert(op == ir_txb || op == ir_txl || op == ir_txf);
 219
 220       for (int i = 0; i < coord_components; i++) {
 221          bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
 222                  coordinate);
 223          coordinate = offset(coordinate, 1);
 224       }
 225
 226       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
 227        * be necessary for TXF (ld), but seems wise to do for all messages.
 228        */
 229       for (int i = coord_components; i < 3; i++) {
 230          bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
 231       }
 232
 233       /* lod/bias appears after u/v/r. */
 234       mlen += 6;
 235
 236       bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod);
 237       mlen++;
 238
 239       /* The unused upper half. */
 240       mlen++;
 241    }
 242
 243    if (simd16) {
 244       /* Now, since we're doing simd16, the return is 2 interleaved
 245        * vec4s where the odd-indexed ones are junk. We'll need to move
 246        * this weirdness around to the expected layout.
 247        */
 248       orig_dst = dst;
 249       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
 250    }
 251
 252    enum opcode opcode;
 253    switch (op) {
 254    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 255    case ir_txb: opcode = FS_OPCODE_TXB; break;
 256    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 257    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 258    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 259    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 260    default:
 261       unreachable("not reached");
 262    }
 263
 264    fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
 265    inst->base_mrf = base_mrf;
 266    inst->mlen = mlen;
 267    inst->header_size = 1;
 268    inst->regs_written = simd16 ? 8 : 4;
 269
 270    if (simd16) {
 271       for (int i = 0; i < 4; i++) {
 272          bld.MOV(orig_dst, dst);
 273          orig_dst = offset(orig_dst, 1);
 274          dst = offset(dst, 2);
 275       }
 276    }
 277
 278    return inst;
 279 }
 280
 281 fs_inst *
 282 fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
 283                                      fs_reg coordinate, int vector_elements,
 284                                      fs_reg shadow_c, fs_reg lod,
 285                                      uint32_t sampler)
 286 {
 287    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
 288    bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
 289
 290    if (has_lod && shadow_c.file != BAD_FILE)
 291       no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
 292
 293    if (op == ir_txd)
 294       no16("textureGrad unsupported in SIMD16.");
 295
 296    /* Copy the coordinates. */
 297    for (int i = 0; i < vector_elements; i++) {
 298       bld.MOV(retype(offset(message, i), coordinate.type), coordinate);
 299       coordinate = offset(coordinate, 1);
 300    }
 301
 302    fs_reg msg_end = offset(message, vector_elements);
 303
 304    /* Messages other than sample and ld require all three components */
 305    if (has_lod || shadow_c.file != BAD_FILE) {
 306       for (int i = vector_elements; i < 3; i++) {
 307          bld.MOV(offset(message, i), fs_reg(0.0f));
 308       }
 309    }
 310
 311    if (has_lod) {
 312       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
 313                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
 314       bld.MOV(msg_lod, lod);
 315       msg_end = offset(msg_lod, 1);
 316    }
 317
 318    if (shadow_c.file != BAD_FILE) {
 319       fs_reg msg_ref = offset(message, 3 + has_lod);
 320       bld.MOV(msg_ref, shadow_c);
 321       msg_end = offset(msg_ref, 1);
 322    }
 323
 324    enum opcode opcode;
 325    switch (op) {
 326    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 327    case ir_txb: opcode = FS_OPCODE_TXB;     break;
 328    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 329    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 330    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 331    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 332    default: unreachable("not reached");
 333    }
 334
 335    fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
 336    inst->base_mrf = message.reg - 1;
 337    inst->mlen = msg_end.reg - inst->base_mrf;
 338    inst->header_size = 1;
 339    inst->regs_written = 8;
 340
 341    return inst;
 342 }
 343
 344 /* gen5's sampler has slots for u, v, r, array index, then optional
 345  * parameters like shadow comparitor or LOD bias.  If optional
 346  * parameters aren't present, those base slots are optional and don't
 347  * need to be included in the message.
 348  *
 349  * We don't fill in the unnecessary slots regardless, which may look
 350  * surprising in the disassembly.
 351  */
 352 fs_inst *
 353 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
 354                               fs_reg coordinate, int vector_elements,
 355                               fs_reg shadow_c,
 356                               fs_reg lod, fs_reg lod2, int grad_components,
 357                               fs_reg sample_index, uint32_t sampler,
 358                               bool has_offset)
 359 {
 360    int reg_width = dispatch_width / 8;
 361    unsigned header_size = 0;
 362
 363    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
 364    fs_reg msg_coords = message;
 365
 366    if (has_offset) {
 367       /* The offsets set up by the ir_texture visitor are in the
 368        * m1 header, so we can't go headerless.
 369        */
 370       header_size = 1;
 371       message.reg--;
 372    }
 373
 374    for (int i = 0; i < vector_elements; i++) {
 375       bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate);
 376       coordinate = offset(coordinate, 1);
 377    }
 378    fs_reg msg_end = offset(msg_coords, vector_elements);
 379    fs_reg msg_lod = offset(msg_coords, 4);
 380
 381    if (shadow_c.file != BAD_FILE) {
 382       fs_reg msg_shadow = msg_lod;
 383       bld.MOV(msg_shadow, shadow_c);
 384       msg_lod = offset(msg_shadow, 1);
 385       msg_end = msg_lod;
 386    }
 387
 388    enum opcode opcode;
 389    switch (op) {
 390    case ir_tex:
 391       opcode = SHADER_OPCODE_TEX;
 392       break;
 393    case ir_txb:
 394       bld.MOV(msg_lod, lod);
 395       msg_end = offset(msg_lod, 1);
 396
 397       opcode = FS_OPCODE_TXB;
 398       break;
 399    case ir_txl:
 400       bld.MOV(msg_lod, lod);
 401       msg_end = offset(msg_lod, 1);
 402
 403       opcode = SHADER_OPCODE_TXL;
 404       break;
 405    case ir_txd: {
 406       /**
 407        *  P   =  u,    v,    r
 408        * dPdx = dudx, dvdx, drdx
 409        * dPdy = dudy, dvdy, drdy
 410        *
 411        * Load up these values:
 412        * - dudx   dudy   dvdx   dvdy   drdx   drdy
 413        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
 414        */
 415       msg_end = msg_lod;
 416       for (int i = 0; i < grad_components; i++) {
 417          bld.MOV(msg_end, lod);
 418          lod = offset(lod, 1);
 419          msg_end = offset(msg_end, 1);
 420
 421          bld.MOV(msg_end, lod2);
 422          lod2 = offset(lod2, 1);
 423          msg_end = offset(msg_end, 1);
 424       }
 425
 426       opcode = SHADER_OPCODE_TXD;
 427       break;
 428    }
 429    case ir_txs:
 430       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
 431       bld.MOV(msg_lod, lod);
 432       msg_end = offset(msg_lod, 1);
 433
 434       opcode = SHADER_OPCODE_TXS;
 435       break;
 436    case ir_query_levels:
 437       msg_lod = msg_end;
 438       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
 439       msg_end = offset(msg_lod, 1);
 440
 441       opcode = SHADER_OPCODE_TXS;
 442       break;
 443    case ir_txf:
 444       msg_lod = offset(msg_coords, 3);
 445       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
 446       msg_end = offset(msg_lod, 1);
 447
 448       opcode = SHADER_OPCODE_TXF;
 449       break;
 450    case ir_txf_ms:
 451       msg_lod = offset(msg_coords, 3);
 452       /* lod */
 453       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
 454       /* sample index */
 455       bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index);
 456       msg_end = offset(msg_lod, 2);
 457
 458       opcode = SHADER_OPCODE_TXF_CMS;
 459       break;
 460    case ir_lod:
 461       opcode = SHADER_OPCODE_LOD;
 462       break;
 463    case ir_tg4:
 464       opcode = SHADER_OPCODE_TG4;
 465       break;
 466    default:
 467       unreachable("not reached");
 468    }
 469
 470    fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
 471    inst->base_mrf = message.reg;
 472    inst->mlen = msg_end.reg - message.reg;
 473    inst->header_size = header_size;
 474    inst->regs_written = 4 * reg_width;
 475
 476    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
 477       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
 478            " disallowed by hardware\n");
 479    }
 480
 481    return inst;
 482 }
 483
 484 static bool
 485 is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
 486 {
 487    if (devinfo->gen < 8 && !devinfo->is_haswell)
 488       return false;
 489
 490    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 491 }
 492
 493 fs_inst *
 494 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 495                               fs_reg coordinate, int coord_components,
 496                               fs_reg shadow_c,
 497                               fs_reg lod, fs_reg lod2, int grad_components,
 498                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
 499                               fs_reg offset_value)
 500 {
 501    int reg_width = dispatch_width / 8;
 502    unsigned header_size = 0;
 503
 504    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
 505    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
 506       sources[i] = vgrf(glsl_type::float_type);
 507    }
 508    int length = 0;
 509
 510    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
 511        is_high_sampler(devinfo, sampler)) {
 512       /* For general texture offsets (no txf workaround), we need a header to
 513        * put them in.  Note that for SIMD16 we're making space for two actual
 514        * hardware registers here, so the emit will have to fix up for this.
 515        *
 516        * * ir4_tg4 needs to place its channel select in the header,
 517        * for interaction with ARB_texture_swizzle
 518        *
 519        * The sampler index is only 4-bits, so for larger sampler numbers we
 520        * need to offset the Sampler State Pointer in the header.
 521        */
 522       header_size = 1;
 523       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 524       length++;
 525    }
 526
 527    if (shadow_c.file != BAD_FILE) {
 528       bld.MOV(sources[length], shadow_c);
 529       length++;
 530    }
 531
 532    bool has_nonconstant_offset =
 533       offset_value.file != BAD_FILE && offset_value.file != IMM;
 534    bool coordinate_done = false;
 535
 536    /* The sampler can only meaningfully compute LOD for fragment shader
 537     * messages. For all other stages, we change the opcode to ir_txl and
 538     * hardcode the LOD to 0.
 539     */
 540    if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
 541       op = ir_txl;
 542       lod = fs_reg(0.0f);
 543    }
 544
 545    /* Set up the LOD info */
 546    switch (op) {
 547    case ir_tex:
 548    case ir_lod:
 549       break;
 550    case ir_txb:
 551       bld.MOV(sources[length], lod);
 552       length++;
 553       break;
 554    case ir_txl:
 555       bld.MOV(sources[length], lod);
 556       length++;
 557       break;
 558    case ir_txd: {
 559       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
 560
 561       /* Load dPdx and the coordinate together:
 562        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
 563        */
 564       for (int i = 0; i < coord_components; i++) {
 565          bld.MOV(sources[length], coordinate);
 566          coordinate = offset(coordinate, 1);
 567          length++;
 568
 569          /* For cube map array, the coordinate is (u,v,r,ai) but there are
 570           * only derivatives for (u, v, r).
 571           */
 572          if (i < grad_components) {
 573             bld.MOV(sources[length], lod);
 574             lod = offset(lod, 1);
 575             length++;
 576
 577             bld.MOV(sources[length], lod2);
 578             lod2 = offset(lod2, 1);
 579             length++;
 580          }
 581       }
 582
 583       coordinate_done = true;
 584       break;
 585    }
 586    case ir_txs:
 587       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
 588       length++;
 589       break;
 590    case ir_query_levels:
 591       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u));
 592       length++;
 593       break;
 594    case ir_txf:
 595       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
 596        * On Gen9 they are u, v, lod, r
 597        */
 598
 599       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 600       coordinate = offset(coordinate, 1);
 601       length++;
 602
 603       if (devinfo->gen >= 9) {
 604          if (coord_components >= 2) {
 605             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 606             coordinate = offset(coordinate, 1);
 607          }
 608          length++;
 609       }
 610
 611       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
 612       length++;
 613
 614       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
 615          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 616          coordinate = offset(coordinate, 1);
 617          length++;
 618       }
 619
 620       coordinate_done = true;
 621       break;
 622    case ir_txf_ms:
 623       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
 624       length++;
 625
 626       /* data from the multisample control surface */
 627       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
 628       length++;
 629
 630       /* there is no offsetting for this message; just copy in the integer
 631        * texture coordinates
 632        */
 633       for (int i = 0; i < coord_components; i++) {
 634          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 635          coordinate = offset(coordinate, 1);
 636          length++;
 637       }
 638
 639       coordinate_done = true;
 640       break;
 641    case ir_tg4:
 642       if (has_nonconstant_offset) {
 643          if (shadow_c.file != BAD_FILE)
 644             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
 645
 646          /* More crazy intermixing */
 647          for (int i = 0; i < 2; i++) { /* u, v */
 648             bld.MOV(sources[length], coordinate);
 649             coordinate = offset(coordinate, 1);
 650             length++;
 651          }
 652
 653          for (int i = 0; i < 2; i++) { /* offu, offv */
 654             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
 655             offset_value = offset(offset_value, 1);
 656             length++;
 657          }
 658
 659          if (coord_components == 3) { /* r if present */
 660             bld.MOV(sources[length], coordinate);
 661             coordinate = offset(coordinate, 1);
 662             length++;
 663          }
 664
 665          coordinate_done = true;
 666       }
 667       break;
 668    }
 669
 670    /* Set up the coordinate (except for cases where it was done above) */
 671    if (!coordinate_done) {
 672       for (int i = 0; i < coord_components; i++) {
 673          bld.MOV(sources[length], coordinate);
 674          coordinate = offset(coordinate, 1);
 675          length++;
 676       }
 677    }
 678
 679    int mlen;
 680    if (reg_width == 2)
 681       mlen = length * reg_width - header_size;
 682    else
 683       mlen = length * reg_width;
 684
 685    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
 686                                BRW_REGISTER_TYPE_F, dispatch_width);
 687    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 688
 689    /* Generate the SEND */
 690    enum opcode opcode;
 691    switch (op) {
 692    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
 693    case ir_txb: opcode = FS_OPCODE_TXB; break;
 694    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
 695    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
 696    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
 697    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
 698    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
 699    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
 700    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
 701    case ir_tg4:
 702       if (has_nonconstant_offset)
 703          opcode = SHADER_OPCODE_TG4_OFFSET;
 704       else
 705          opcode = SHADER_OPCODE_TG4;
 706       break;
 707    default:
 708       unreachable("not reached");
 709    }
 710    fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler);
 711    inst->base_mrf = -1;
 712    inst->mlen = mlen;
 713    inst->header_size = header_size;
 714    inst->regs_written = 4 * reg_width;
 715
 716    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
 717       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
 718            " disallowed by hardware\n");
 719    }
 720
 721    return inst;
 722 }
 723
 724 fs_reg
 725 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 726                              bool is_rect, uint32_t sampler, int texunit)
 727 {
 728    bool needs_gl_clamp = true;
 729    fs_reg scale_x, scale_y;
 730
 731    /* The 965 requires the EU to do the normalization of GL rectangle
 732     * texture coordinates.  We use the program parameter state
 733     * tracking to get the scaling factor.
 734     */
 735    if (is_rect &&
 736        (devinfo->gen < 6 ||
 737         (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
 738                                key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
 739       struct gl_program_parameter_list *params = prog->Parameters;
 740       int tokens[STATE_LENGTH] = {
 741          STATE_INTERNAL,
 742          STATE_TEXRECT_SCALE,
 743          texunit,
 744          0,
 745          0
 746       };
 747
 748       no16("rectangle scale uniform setup not supported on SIMD16\n");
 749       if (dispatch_width == 16) {
 750          return coordinate;
 751       }
 752
 753       GLuint index = _mesa_add_state_reference(params,
 754                                                (gl_state_index *)tokens);
 755       /* Try to find existing copies of the texrect scale uniforms. */
 756       for (unsigned i = 0; i < uniforms; i++) {
 757          if (stage_prog_data->param[i] ==
 758              &prog->Parameters->ParameterValues[index][0]) {
 759             scale_x = fs_reg(UNIFORM, i);
 760             scale_y = fs_reg(UNIFORM, i + 1);
 761             break;
 762          }
 763       }
 764
 765       /* If we didn't already set them up, do so now. */
 766       if (scale_x.file == BAD_FILE) {
 767          scale_x = fs_reg(UNIFORM, uniforms);
 768          scale_y = fs_reg(UNIFORM, uniforms + 1);
 769
 770          stage_prog_data->param[uniforms++] =
 771             &prog->Parameters->ParameterValues[index][0];
 772          stage_prog_data->param[uniforms++] =
 773             &prog->Parameters->ParameterValues[index][1];
 774       }
 775    }
 776
 777    /* The 965 requires the EU to do the normalization of GL rectangle
 778     * texture coordinates.  We use the program parameter state
 779     * tracking to get the scaling factor.
 780     */
 781    if (devinfo->gen < 6 && is_rect) {
 782       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
 783       fs_reg src = coordinate;
 784       coordinate = dst;
 785
 786       bld.MUL(dst, src, scale_x);
 787       dst = offset(dst, 1);
 788       src = offset(src, 1);
 789       bld.MUL(dst, src, scale_y);
 790    } else if (is_rect) {
 791       /* On gen6+, the sampler handles the rectangle coordinates
 792        * natively, without needing rescaling.  But that means we have
 793        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
 794        * not [0, 1] like the default case below.
 795        */
 796       needs_gl_clamp = false;
 797
 798       for (int i = 0; i < 2; i++) {
 799          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 800             fs_reg chan = coordinate;
 801             chan = offset(chan, i);
 802
 803             set_condmod(BRW_CONDITIONAL_GE,
 804                         bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
 805
 806             /* Our parameter comes in as 1.0/width or 1.0/height,
 807              * because that's what people normally want for doing
 808              * texture rectangle handling.  We need width or height
 809              * for clamping, but we don't care enough to make a new
 810              * parameter type, so just invert back.
 811              */
 812             fs_reg limit = vgrf(glsl_type::float_type);
 813             bld.MOV(limit, i == 0 ? scale_x : scale_y);
 814             bld.emit(SHADER_OPCODE_RCP, limit, limit);
 815
 816             set_condmod(BRW_CONDITIONAL_L,
 817                         bld.emit(BRW_OPCODE_SEL, chan, chan, limit));
 818          }
 819       }
 820    }
 821
 822    if (coord_components > 0 && needs_gl_clamp) {
 823       for (int i = 0; i < MIN2(coord_components, 3); i++) {
 824          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 825             fs_reg chan = coordinate;
 826             chan = offset(chan, i);
 827             set_saturate(true, bld.MOV(chan, chan));
 828          }
 829       }
 830    }
 831    return coordinate;
 832 }
 833
 834 /* Sample from the MCS surface attached to this multisample texture. */
 835 fs_reg
 836 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
 837 {
 838    int reg_width = dispatch_width / 8;
 839    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
 840                            BRW_REGISTER_TYPE_F, dispatch_width);
 841    fs_reg dest = vgrf(glsl_type::uvec4_type);
 842    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
 843
 844    /* parameters are: u, v, r; missing parameters are treated as zero */
 845    for (int i = 0; i < components; i++) {
 846       sources[i] = vgrf(glsl_type::float_type);
 847       bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
 848       coordinate = offset(coordinate, 1);
 849    }
 850
 851    bld.LOAD_PAYLOAD(payload, sources, components, 0);
 852
 853    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
 854    inst->base_mrf = -1;
 855    inst->mlen = components * reg_width;
 856    inst->header_size = 0;
 857    inst->regs_written = 4 * reg_width; /* we only care about one reg of
 858                                         * response, but the sampler always
 859                                         * writes 4/8
 860                                         */
 861
 862    return dest;
 863 }
 864
 865 void
 866 fs_visitor::emit_texture(ir_texture_opcode op,
 867                          const glsl_type *dest_type,
 868                          fs_reg coordinate, int coord_components,
 869                          fs_reg shadow_c,
 870                          fs_reg lod, fs_reg lod2, int grad_components,
 871                          fs_reg sample_index,
 872                          fs_reg offset_value,
 873                          fs_reg mcs,
 874                          int gather_component,
 875                          bool is_cube_array,
 876                          bool is_rect,
 877                          uint32_t sampler,
 878                          fs_reg sampler_reg, int texunit)
 879 {
 880    fs_inst *inst = NULL;
 881
 882    if (op == ir_tg4) {
 883       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
 884        * emitting anything other than setting up the constant result.
 885        */
 886       int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
 887       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
 888
 889          fs_reg res = vgrf(glsl_type::vec4_type);
 890          this->result = res;
 891
 892          for (int i=0; i<4; i++) {
 893             bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
 894             res = offset(res, 1);
 895          }
 896          return;
 897       }
 898    }
 899
 900    if (coordinate.file != BAD_FILE) {
 901       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
 902        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
 903        */
 904       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
 905                                     sampler, texunit);
 906    }
 907
 908    /* Writemasking doesn't eliminate channels on SIMD8 texture
 909     * samples, so don't worry about them.
 910     */
 911    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
 912
 913    if (devinfo->gen >= 7) {
 914       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
 915                                shadow_c, lod, lod2, grad_components,
 916                                sample_index, mcs, sampler_reg,
 917                                offset_value);
 918    } else if (devinfo->gen >= 5) {
 919       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
 920                                shadow_c, lod, lod2, grad_components,
 921                                sample_index, sampler,
 922                                offset_value.file != BAD_FILE);
 923    } else if (dispatch_width == 16) {
 924       inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
 925                                       shadow_c, lod, sampler);
 926    } else {
 927       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
 928                                shadow_c, lod, lod2, grad_components,
 929                                sampler);
 930    }
 931
 932    if (shadow_c.file != BAD_FILE)
 933       inst->shadow_compare = true;
 934
 935    if (offset_value.file == IMM)
 936       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
 937
 938    if (op == ir_tg4) {
 939       inst->offset |=
 940          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
 941
 942       if (devinfo->gen == 6)
 943          emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
 944    }
 945
 946    /* fixup #layers for cube map arrays */
 947    if (op == ir_txs && is_cube_array) {
 948       fs_reg depth = offset(dst, 2);
 949       fs_reg fixed_depth = vgrf(glsl_type::int_type);
 950       bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 951
 952       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
 953       int components = inst->regs_written / (dst.width / 8);
 954       for (int i = 0; i < components; i++) {
 955          if (i == 2) {
 956             fixed_payload[i] = fixed_depth;
 957          } else {
 958             fixed_payload[i] = offset(dst, i);
 959          }
 960       }
 961       bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
 962    }
 963
 964    swizzle_result(op, dest_type->vector_elements, dst, sampler);
 965 }
 966
 967 /**
 968  * Apply workarounds for Gen6 gather with UINT/SINT
 969  */
 970 void
 971 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
 972 {
 973    if (!wa)
 974       return;
 975
 976    int width = (wa & WA_8BIT) ? 8 : 16;
 977
 978    for (int i = 0; i < 4; i++) {
 979       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
 980       /* Convert from UNORM to UINT */
 981       bld.MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)));
 982       bld.MOV(dst, dst_f);
 983
 984       if (wa & WA_SIGN) {
 985          /* Reinterpret the UINT value as a signed INT value by
 986           * shifting the sign bit into place, then shifting back
 987           * preserving sign.
 988           */
 989          bld.SHL(dst, dst, fs_reg(32 - width));
 990          bld.ASR(dst, dst, fs_reg(32 - width));
 991       }
 992
 993       dst = offset(dst, 1);
 994    }
 995 }
 996
 997 /**
 998  * Set up the gather channel based on the swizzle, for gather4.
 999  */
1000 uint32_t
1001 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
1002 {
1003    int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
1004    switch (swiz) {
1005       case SWIZZLE_X: return 0;
1006       case SWIZZLE_Y:
1007          /* gather4 sampler is broken for green channel on RG32F --
1008           * we must ask for blue instead.
1009           */
1010          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1011             return 2;
1012          return 1;
1013       case SWIZZLE_Z: return 2;
1014       case SWIZZLE_W: return 3;
1015       default:
1016          unreachable("Not reached"); /* zero, one swizzles handled already */
1017    }
1018 }
1019
1020 /**
1021  * Swizzle the result of a texture result.  This is necessary for
1022  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1023  */
1024 void
1025 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
1026                            fs_reg orig_val, uint32_t sampler)
1027 {
1028    if (op == ir_query_levels) {
1029       /* # levels is in .w */
1030       this->result = offset(orig_val, 3);
1031       return;
1032    }
1033
1034    this->result = orig_val;
1035
1036    /* txs,lod don't actually sample the texture, so swizzling the result
1037     * makes no sense.
1038     */
1039    if (op == ir_txs || op == ir_lod || op == ir_tg4)
1040       return;
1041
1042    if (dest_components == 1) {
1043       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1044    } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
1045       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
1046       swizzled_result.type = orig_val.type;
1047
1048       for (int i = 0; i < 4; i++) {
1049          int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
1050          fs_reg l = swizzled_result;
1051          l = offset(l, i);
1052
1053          if (swiz == SWIZZLE_ZERO) {
1054             bld.MOV(l, fs_reg(0.0f));
1055          } else if (swiz == SWIZZLE_ONE) {
1056             bld.MOV(l, fs_reg(1.0f));
1057          } else {
1058             bld.MOV(l, offset(orig_val,
1059                               GET_SWZ(key_tex->swizzles[sampler], i)));
1060          }
1061       }
1062       this->result = swizzled_result;
1063    }
1064 }
1065
1066 /**
1067  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
1068  *
1069  * Many GLSL shaders contain the following pattern:
1070  *
1071  *    x = condition ? foo : bar
1072  *
1073  * The compiler emits an ir_if tree for this, since each subexpression might be
1074  * a complex tree that could have side-effects or short-circuit logic.
1075  *
1076  * However, the common case is to simply select one of two constants or
1077  * variable values---which is exactly what SEL is for.  In this case, the
1078  * assembly looks like:
1079  *
1080  *    (+f0) IF
1081  *    MOV dst src0
1082  *    ELSE
1083  *    MOV dst src1
1084  *    ENDIF
1085  *
1086  * which can be easily translated into:
1087  *
1088  *    (+f0) SEL dst src0 src1
1089  *
1090  * If src0 is an immediate value, we promote it to a temporary GRF.
1091  */
1092 bool
1093 fs_visitor::try_replace_with_sel()
1094 {
1095    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
1096    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
1097
1098    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
1099    int opcodes[] = {
1100       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
1101    };
1102
1103    fs_inst *match = (fs_inst *) endif_inst->prev;
1104    for (int i = 0; i < 4; i++) {
1105       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
1106          return false;
1107       match = (fs_inst *) match->prev;
1108    }
1109
1110    /* The opcodes match; it looks like the right sequence of instructions. */
1111    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
1112    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
1113    fs_inst *if_inst = (fs_inst *) then_mov->prev;
1114
1115    /* Check that the MOVs are the right form. */
1116    if (then_mov->dst.equals(else_mov->dst) &&
1117        !then_mov->is_partial_write() &&
1118        !else_mov->is_partial_write()) {
1119
1120       /* Remove the matched instructions; we'll emit a SEL to replace them. */
1121       while (!if_inst->next->is_tail_sentinel())
1122          if_inst->next->exec_node::remove();
1123       if_inst->exec_node::remove();
1124
1125       /* Only the last source register can be a constant, so if the MOV in
1126        * the "then" clause uses a constant, we need to put it in a temporary.
1127        */
1128       fs_reg src0(then_mov->src[0]);
1129       if (src0.file == IMM) {
1130          src0 = vgrf(glsl_type::float_type);
1131          src0.type = then_mov->src[0].type;
1132          bld.MOV(src0, then_mov->src[0]);
1133       }
1134
1135       if (if_inst->conditional_mod) {
1136          /* Sandybridge-specific IF with embedded comparison */
1137          bld.CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
1138                  if_inst->conditional_mod);
1139          set_predicate(BRW_PREDICATE_NORMAL,
1140                        bld.emit(BRW_OPCODE_SEL, then_mov->dst,
1141                                 src0, else_mov->src[0]));
1142       } else {
1143          /* Separate CMP and IF instructions */
1144          set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
1145                            bld.emit(BRW_OPCODE_SEL, then_mov->dst,
1146                                     src0, else_mov->src[0]));
1147       }
1148
1149       return true;
1150    }
1151
1152    return false;
1153 }
1154
1155 void
1156 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1157                                 fs_reg dst, fs_reg offset, fs_reg src0,
1158                                 fs_reg src1)
1159 {
1160    int reg_width = dispatch_width / 8;
1161    int length = 0;
1162
1163    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
1164
1165    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1166    /* Initialize the sample mask in the message header. */
1167    bld.exec_all().MOV(sources[0], fs_reg(0u));
1168
1169    if (stage == MESA_SHADER_FRAGMENT) {
1170       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
1171          bld.exec_all()
1172             .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
1173       } else {
1174          bld.exec_all()
1175             .MOV(component(sources[0], 7),
1176                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
1177       }
1178    } else {
1179       /* The execution mask is part of the side-band information sent together with
1180        * the message payload to the data port. It's implicitly ANDed with the sample
1181        * mask sent in the header to compute the actual set of channels that execute
1182        * the atomic operation.
1183        */
1184       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
1185       bld.exec_all()
1186          .MOV(component(sources[0], 7), fs_reg(0xffffu));
1187    }
1188    length++;
1189
1190    /* Set the atomic operation offset. */
1191    sources[1] = vgrf(glsl_type::uint_type);
1192    bld.MOV(sources[1], offset);
1193    length++;
1194
1195    /* Set the atomic operation arguments. */
1196    if (src0.file != BAD_FILE) {
1197       sources[length] = vgrf(glsl_type::uint_type);
1198       bld.MOV(sources[length], src0);
1199       length++;
1200    }
1201
1202    if (src1.file != BAD_FILE) {
1203       sources[length] = vgrf(glsl_type::uint_type);
1204       bld.MOV(sources[length], src1);
1205       length++;
1206    }
1207
1208    int mlen = 1 + (length - 1) * reg_width;
1209    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1210                                BRW_REGISTER_TYPE_UD, dispatch_width);
1211    bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
1212
1213    /* Emit the instruction. */
1214    fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
1215                             fs_reg(surf_index), fs_reg(atomic_op));
1216    inst->mlen = mlen;
1217 }
1218
1219 void
1220 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
1221                                       fs_reg offset)
1222 {
1223    int reg_width = dispatch_width / 8;
1224
1225    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1226
1227    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1228    /* Initialize the sample mask in the message header. */
1229    bld.exec_all()
1230       .MOV(sources[0], fs_reg(0u));
1231
1232    if (stage == MESA_SHADER_FRAGMENT) {
1233       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
1234          bld.exec_all()
1235             .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
1236       } else {
1237          bld.exec_all()
1238             .MOV(component(sources[0], 7),
1239                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
1240       }
1241    } else {
1242       /* The execution mask is part of the side-band information sent together with
1243        * the message payload to the data port. It's implicitly ANDed with the sample
1244        * mask sent in the header to compute the actual set of channels that execute
1245        * the atomic operation.
1246        */
1247       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
1248       bld.exec_all()
1249          .MOV(component(sources[0], 7), fs_reg(0xffffu));
1250    }
1251
1252    /* Set the surface read offset. */
1253    sources[1] = vgrf(glsl_type::uint_type);
1254    bld.MOV(sources[1], offset);
1255
1256    int mlen = 1 + reg_width;
1257    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1258                                BRW_REGISTER_TYPE_UD, dispatch_width);
1259    fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
1260
1261    /* Emit the instruction. */
1262    inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
1263                    fs_reg(surf_index), fs_reg(1));
1264    inst->mlen = mlen;
1265 }
1266
1267 fs_inst *
1268 fs_visitor::emit(fs_inst *inst)
1269 {
1270    if (dispatch_width == 16 && inst->exec_size == 8)
1271       inst->force_uncompressed = true;
1272
1273    inst->annotation = this->current_annotation;
1274    inst->ir = this->base_ir;
1275
1276    this->instructions.push_tail(inst);
1277
1278    return inst;
1279 }
1280
/**
 * Append every instruction of \p list to this visitor's instruction stream.
 *
 * Each node is unlinked from the list it is currently on and then passed to
 * emit(fs_inst *), so it goes through the same path as any individually
 * emitted instruction.
 */
void
fs_visitor::emit(exec_list list)
{
   /* The _safe iteration variant is used because the current node is
    * unlinked inside the loop body.
    */
   foreach_in_list_safe(fs_inst, inst, &list) {
      inst->exec_node::remove();
      emit(inst);
   }
}
1289
1290 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1291 void
1292 fs_visitor::emit_dummy_fs()
1293 {
1294    int reg_width = dispatch_width / 8;
1295
1296    /* Everyone's favorite color. */
1297    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
1298    for (int i = 0; i < 4; i++) {
1299       bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
1300                      dispatch_width), fs_reg(color[i]));
1301    }
1302
1303    fs_inst *write;
1304    write = bld.emit(FS_OPCODE_FB_WRITE);
1305    write->eot = true;
1306    if (devinfo->gen >= 6) {
1307       write->base_mrf = 2;
1308       write->mlen = 4 * reg_width;
1309    } else {
1310       write->header_size = 2;
1311       write->base_mrf = 0;
1312       write->mlen = 2 + 4 * reg_width;
1313    }
1314
1315    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
1316     * varying to avoid GPU hangs, so set that.
1317     */
1318    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
1319    wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
1320    memset(wm_prog_data->urb_setup, -1,
1321           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1322
1323    /* We don't have any uniforms. */
1324    stage_prog_data->nr_params = 0;
1325    stage_prog_data->nr_pull_params = 0;
1326    stage_prog_data->curb_read_length = 0;
1327    stage_prog_data->dispatch_grf_start_reg = 2;
1328    wm_prog_data->dispatch_grf_start_reg_16 = 2;
1329    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
1330
1331    calculate_cfg();
1332 }
1333
1334 /* The register location here is relative to the start of the URB
1335  * data.  It will get adjusted to be a real location before
1336  * generate_code() time.
1337  */
1338 struct brw_reg
1339 fs_visitor::interp_reg(int location, int channel)
1340 {
1341    assert(stage == MESA_SHADER_FRAGMENT);
1342    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1343    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
1344    int stride = (channel & 1) * 4;
1345
1346    assert(prog_data->urb_setup[location] != -1);
1347
1348    return brw_vec1_grf(regnr, stride);
1349 }
1350
1351 /** Emits the interpolation for the varying inputs. */
1352 void
1353 fs_visitor::emit_interpolation_setup_gen4()
1354 {
1355    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1356
1357    fs_builder abld = bld.annotate("compute pixel centers");
1358    this->pixel_x = vgrf(glsl_type::uint_type);
1359    this->pixel_y = vgrf(glsl_type::uint_type);
1360    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1361    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1362    abld.ADD(this->pixel_x,
1363             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1364             fs_reg(brw_imm_v(0x10101010)));
1365    abld.ADD(this->pixel_y,
1366             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1367             fs_reg(brw_imm_v(0x11001100)));
1368
1369    abld = bld.annotate("compute pixel deltas from v0");
1370
1371    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1372       vgrf(glsl_type::vec2_type);
1373    const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1374    const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
1375    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
1376
1377    if (devinfo->has_pln && dispatch_width == 16) {
1378       for (unsigned i = 0; i < 2; i++) {
1379          abld.half(i).ADD(half(offset(delta_xy, i), 0),
1380                           half(this->pixel_x, i), xstart);
1381          abld.half(i).ADD(half(offset(delta_xy, i), 1),
1382                           half(this->pixel_y, i), ystart);
1383       }
1384    } else {
1385       abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart);
1386       abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart);
1387    }
1388
1389    abld = bld.annotate("compute pos.w and 1/pos.w");
1390    /* Compute wpos.w.  It's always in our setup, since it's needed to
1391     * interpolate the other attributes.
1392     */
1393    this->wpos_w = vgrf(glsl_type::float_type);
1394    abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
1395              interp_reg(VARYING_SLOT_POS, 3));
1396    /* Compute the pixel 1/W value from wpos.w. */
1397    this->pixel_w = vgrf(glsl_type::float_type);
1398    abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1399 }
1400
1401 /** Emits the interpolation for the varying inputs. */
1402 void
1403 fs_visitor::emit_interpolation_setup_gen6()
1404 {
1405    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1406
1407    fs_builder abld = bld.annotate("compute pixel centers");
1408    if (brw->gen >= 8 || dispatch_width == 8) {
1409       /* The "Register Region Restrictions" page says for BDW (and newer,
1410        * presumably):
1411        *
1412        *     "When destination spans two registers, the source may be one or
1413        *      two registers. The destination elements must be evenly split
1414        *      between the two registers."
1415        *
1416        * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
1417        * compute our pixel centers.
1418        */
1419       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
1420                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
1421       abld.exec_all()
1422           .ADD(int_pixel_xy,
1423                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
1424                fs_reg(brw_imm_v(0x11001010)));
1425
1426       this->pixel_x = vgrf(glsl_type::float_type);
1427       this->pixel_y = vgrf(glsl_type::float_type);
1428       abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
1429       abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
1430    } else {
1431       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
1432        *
1433        *     "When destination spans two registers, the source MUST span two
1434        *      registers."
1435        *
1436        * Since the GRF source of the ADD will only read a single register, we
1437        * must do two separate ADDs in SIMD16.
1438        */
1439       fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
1440       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
1441       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1442       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1443       abld.ADD(int_pixel_x,
1444                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1445                fs_reg(brw_imm_v(0x10101010)));
1446       abld.ADD(int_pixel_y,
1447                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1448                fs_reg(brw_imm_v(0x11001100)));
1449
1450       /* As of gen6, we can no longer mix float and int sources.  We have
1451        * to turn the integer pixel centers into floats for their actual
1452        * use.
1453        */
1454       this->pixel_x = vgrf(glsl_type::float_type);
1455       this->pixel_y = vgrf(glsl_type::float_type);
1456       abld.MOV(this->pixel_x, int_pixel_x);
1457       abld.MOV(this->pixel_y, int_pixel_y);
1458    }
1459
1460    abld = bld.annotate("compute pos.w");
1461    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
1462    this->wpos_w = vgrf(glsl_type::float_type);
1463    abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1464
1465    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1466       uint8_t reg = payload.barycentric_coord_reg[i];
1467       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
1468    }
1469 }
1470
1471 void
1472 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
1473                                 unsigned exec_size, bool use_2nd_half)
1474 {
1475    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476    fs_inst *inst;
1477
1478    if (key->clamp_fragment_color) {
1479       fs_reg tmp = vgrf(glsl_type::vec4_type);
1480       assert(color.type == BRW_REGISTER_TYPE_F);
1481       for (unsigned i = 0; i < components; i++) {
1482          inst = bld.MOV(offset(tmp, i), offset(color, i));
1483          inst->saturate = true;
1484       }
1485       color = tmp;
1486    }
1487
1488    if (exec_size < dispatch_width) {
1489       unsigned half_idx = use_2nd_half ? 1 : 0;
1490       for (unsigned i = 0; i < components; i++)
1491          dst[i] = half(offset(color, i), half_idx);
1492    } else {
1493       for (unsigned i = 0; i < components; i++)
1494          dst[i] = offset(color, i);
1495    }
1496 }
1497
1498 static enum brw_conditional_mod
1499 cond_for_alpha_func(GLenum func)
1500 {
1501    switch(func) {
1502       case GL_GREATER:
1503          return BRW_CONDITIONAL_G;
1504       case GL_GEQUAL:
1505          return BRW_CONDITIONAL_GE;
1506       case GL_LESS:
1507          return BRW_CONDITIONAL_L;
1508       case GL_LEQUAL:
1509          return BRW_CONDITIONAL_LE;
1510       case GL_EQUAL:
1511          return BRW_CONDITIONAL_EQ;
1512       case GL_NOTEQUAL:
1513          return BRW_CONDITIONAL_NEQ;
1514       default:
1515          unreachable("Not reached");
1516    }
1517 }
1518
1519 /**
1520  * Alpha test support for when we compile it into the shader instead
1521  * of using the normal fixed-function alpha test.
1522  */
1523 void
1524 fs_visitor::emit_alpha_test()
1525 {
1526    assert(stage == MESA_SHADER_FRAGMENT);
1527    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1528    const fs_builder abld = bld.annotate("Alpha test");
1529
1530    fs_inst *cmp;
1531    if (key->alpha_test_func == GL_ALWAYS)
1532       return;
1533
1534    if (key->alpha_test_func == GL_NEVER) {
1535       /* f0.1 = 0 */
1536       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1537                                       BRW_REGISTER_TYPE_UW));
1538       cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
1539                      BRW_CONDITIONAL_NEQ);
1540    } else {
1541       /* RT0 alpha */
1542       fs_reg color = offset(outputs[0], 3);
1543
1544       /* f0.1 &= func(color, ref) */
1545       cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
1546                      cond_for_alpha_func(key->alpha_test_func));
1547    }
1548    cmp->predicate = BRW_PREDICATE_NORMAL;
1549    cmp->flag_subreg = 1;
1550 }
1551
1552 fs_inst *
1553 fs_visitor::emit_single_fb_write(const fs_builder &bld,
1554                                  fs_reg color0, fs_reg color1,
1555                                  fs_reg src0_alpha, unsigned components,
1556                                  unsigned exec_size, bool use_2nd_half)
1557 {
1558    assert(stage == MESA_SHADER_FRAGMENT);
1559    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1560    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1561    int header_size = 2, payload_header_size;
1562
1563    /* We can potentially have a message length of up to 15, so we have to set
1564     * base_mrf to either 0 or 1 in order to fit in m0..m15.
1565     */
1566    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
1567    int length = 0;
1568
1569    /* From the Sandy Bridge PRM, volume 4, page 198:
1570     *
1571     *     "Dispatched Pixel Enables. One bit per pixel indicating
1572     *      which pixels were originally enabled when the thread was
1573     *      dispatched. This field is only required for the end-of-
1574     *      thread message and on all dual-source messages."
1575     */
1576    if (devinfo->gen >= 6 &&
1577        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
1578        color1.file == BAD_FILE &&
1579        key->nr_color_regions == 1) {
1580       header_size = 0;
1581    }
1582
1583    if (header_size != 0) {
1584       assert(header_size == 2);
1585       /* Allocate 2 registers for a header */
1586       length += 2;
1587    }
1588
1589    if (payload.aa_dest_stencil_reg) {
1590       sources[length] = fs_reg(GRF, alloc.allocate(1));
1591       bld.exec_all().annotate("FB write stencil/AA alpha")
1592          .MOV(sources[length],
1593               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
1594       length++;
1595    }
1596
1597    prog_data->uses_omask =
1598       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
1599    if (prog_data->uses_omask) {
1600       assert(this->sample_mask.file != BAD_FILE);
1601       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
1602        * it's unsinged single words, one vgrf is always 16-wide.
1603        */
1604       sources[length] = fs_reg(GRF, alloc.allocate(1),
1605                                BRW_REGISTER_TYPE_UW, 16);
1606       bld.exec_all().annotate("FB write oMask")
1607          .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
1608       length++;
1609    }
1610
1611    payload_header_size = length;
1612
1613    if (color0.file == BAD_FILE) {
1614       /* Even if there's no color buffers enabled, we still need to send
1615        * alpha out the pipeline to our null renderbuffer to support
1616        * alpha-testing, alpha-to-coverage, and so on.
1617        */
1618       if (this->outputs[0].file != BAD_FILE)
1619          setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
1620                              1, exec_size, false);
1621       length += 4;
1622    } else if (color1.file == BAD_FILE) {
1623       if (src0_alpha.file != BAD_FILE) {
1624          setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
1625          length++;
1626       }
1627
1628       setup_color_payload(&sources[length], color0, components,
1629                           exec_size, use_2nd_half);
1630       length += 4;
1631    } else {
1632       setup_color_payload(&sources[length], color0, components,
1633                           exec_size, use_2nd_half);
1634       length += 4;
1635       setup_color_payload(&sources[length], color1, components,
1636                           exec_size, use_2nd_half);
1637       length += 4;
1638    }
1639
1640    if (source_depth_to_render_target) {
1641       if (devinfo->gen == 6) {
1642          /* For outputting oDepth on gen6, SIMD8 writes have to be
1643           * used.  This would require SIMD8 moves of each half to
1644           * message regs, kind of like pre-gen5 SIMD16 FB writes.
1645           * Just bail on doing so for now.
1646           */
1647          no16("Missing support for simd16 depth writes on gen6\n");
1648       }
1649
1650       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1651          /* Hand over gl_FragDepth. */
1652          assert(this->frag_depth.file != BAD_FILE);
1653          if (exec_size < dispatch_width) {
1654             sources[length] = half(this->frag_depth, use_2nd_half);
1655          } else {
1656             sources[length] = this->frag_depth;
1657          }
1658       } else {
1659          /* Pass through the payload depth. */
1660          sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
1661       }
1662       length++;
1663    }
1664
1665    if (payload.dest_depth_reg)
1666       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
1667
1668    const fs_builder ubld = bld.group(exec_size, use_2nd_half);
1669    fs_inst *load;
1670    fs_inst *write;
1671    if (devinfo->gen >= 7) {
1672       /* Send from the GRF */
1673       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
1674       load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
1675       payload.reg = alloc.allocate(load->regs_written);
1676       load->dst = payload;
1677       write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
1678       write->base_mrf = -1;
1679    } else {
1680       /* Send from the MRF */
1681       load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
1682                                sources, length, payload_header_size);
1683
1684       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
1685        * will do this for us if we just give it a COMPR4 destination.
1686        */
1687       if (brw->gen < 6 && exec_size == 16)
1688          load->dst.reg |= BRW_MRF_COMPR4;
1689
1690       write = ubld.emit(FS_OPCODE_FB_WRITE);
1691       write->exec_size = exec_size;
1692       write->base_mrf = 1;
1693    }
1694
1695    write->mlen = load->regs_written;
1696    write->header_size = header_size;
1697    if (prog_data->uses_kill) {
1698       write->predicate = BRW_PREDICATE_NORMAL;
1699       write->flag_subreg = 1;
1700    }
1701    return write;
1702 }
1703
1704 void
1705 fs_visitor::emit_fb_writes()
1706 {
1707    assert(stage == MESA_SHADER_FRAGMENT);
1708    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1709    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1710
1711    fs_inst *inst = NULL;
1712    if (do_dual_src) {
1713       const fs_builder abld = bld.annotate("FB dual-source write");
1714
1715       inst = emit_single_fb_write(abld, this->outputs[0],
1716                                   this->dual_src_output, reg_undef, 4, 8);
1717       inst->target = 0;
1718
1719       /* SIMD16 dual source blending requires to send two SIMD8 dual source
1720        * messages, where each message contains color data for 8 pixels. Color
1721        * data for the first group of pixels is stored in the "lower" half of
1722        * the color registers, so in SIMD16, the previous message did:
1723        * m + 0: r0
1724        * m + 1: g0
1725        * m + 2: b0
1726        * m + 3: a0
1727        *
1728        * Here goes the second message, which packs color data for the
1729        * remaining 8 pixels. Color data for these pixels is stored in the
1730        * "upper" half of the color registers, so we need to do:
1731        * m + 0: r1
1732        * m + 1: g1
1733        * m + 2: b1
1734        * m + 3: a1
1735        */
1736       if (dispatch_width == 16) {
1737          inst = emit_single_fb_write(abld, this->outputs[0],
1738                                      this->dual_src_output, reg_undef, 4, 8,
1739                                      true);
1740          inst->target = 0;
1741       }
1742
1743       prog_data->dual_src_blend = true;
1744    } else {
1745       for (int target = 0; target < key->nr_color_regions; target++) {
1746          /* Skip over outputs that weren't written. */
1747          if (this->outputs[target].file == BAD_FILE)
1748             continue;
1749
1750          const fs_builder abld = bld.annotate(
1751             ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
1752
1753          fs_reg src0_alpha;
1754          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
1755             src0_alpha = offset(outputs[0], 3);
1756
1757          inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
1758                                      src0_alpha,
1759                                      this->output_components[target],
1760                                      dispatch_width);
1761          inst->target = target;
1762       }
1763    }
1764
1765    if (inst == NULL) {
1766       /* Even if there's no color buffers enabled, we still need to send
1767        * alpha out the pipeline to our null renderbuffer to support
1768        * alpha-testing, alpha-to-coverage, and so on.
1769        */
1770       inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
1771                                   dispatch_width);
1772       inst->target = 0;
1773    }
1774
1775    inst->eot = true;
1776 }
1777
1778 void
1779 fs_visitor::setup_uniform_clipplane_values()
1780 {
1781    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
1782    const struct brw_vue_prog_key *key =
1783       (const struct brw_vue_prog_key *) this->key;
1784
1785    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
1786       this->userplane[i] = fs_reg(UNIFORM, uniforms);
1787       for (int j = 0; j < 4; ++j) {
1788          stage_prog_data->param[uniforms + j] =
1789             (gl_constant_value *) &clip_planes[i][j];
1790       }
1791       uniforms += 4;
1792    }
1793 }
1794
1795 void fs_visitor::compute_clip_distance()
1796 {
1797    struct brw_vue_prog_data *vue_prog_data =
1798       (struct brw_vue_prog_data *) prog_data;
1799    const struct brw_vue_prog_key *key =
1800       (const struct brw_vue_prog_key *) this->key;
1801
1802    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
1803     *
1804     *     "If a linked set of shaders forming the vertex stage contains no
1805     *     static write to gl_ClipVertex or gl_ClipDistance, but the
1806     *     application has requested clipping against user clip planes through
1807     *     the API, then the coordinate written to gl_Position is used for
1808     *     comparison against the user clip planes."
1809     *
1810     * This function is only called if the shader didn't write to
1811     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
1812     * if the user wrote to it; otherwise we use gl_Position.
1813     */
1814
1815    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
1816    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
1817       clip_vertex = VARYING_SLOT_POS;
1818
1819    /* If the clip vertex isn't written, skip this.  Typically this means
1820     * the GS will set up clipping. */
1821    if (outputs[clip_vertex].file == BAD_FILE)
1822       return;
1823
1824    setup_uniform_clipplane_values();
1825
1826    current_annotation = "user clip distances";
1827
1828    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
1829    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
1830
1831    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
1832       fs_reg u = userplane[i];
1833       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
1834       output.reg_offset = i & 3;
1835
1836       emit(MUL(output, outputs[clip_vertex], u));
1837       for (int j = 1; j < 4; j++) {
1838          u.reg = userplane[i].reg + j;
1839          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
1840       }
1841    }
1842 }
1843
1844 void
1845 fs_visitor::emit_urb_writes()
1846 {
1847    int slot, urb_offset, length;
1848    struct brw_vs_prog_data *vs_prog_data =
1849       (struct brw_vs_prog_data *) prog_data;
1850    const struct brw_vs_prog_key *key =
1851       (const struct brw_vs_prog_key *) this->key;
1852    const GLbitfield64 psiz_mask =
1853       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
1854    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
1855    bool flush;
1856    fs_reg sources[8];
1857
1858    /* Lower legacy ff and ClipVertex clipping to clip distances */
1859    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
1860       compute_clip_distance();
1861
1862    /* If we don't have any valid slots to write, just do a minimal urb write
1863     * send to terminate the shader. */
1864    if (vue_map->slots_valid == 0) {
1865
1866       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1867       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
1868                                                       BRW_REGISTER_TYPE_UD))));
1869       inst->force_writemask_all = true;
1870
1871       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1872       inst->eot = true;
1873       inst->mlen = 1;
1874       inst->offset = 1;
1875       return;
1876    }
1877
1878    length = 0;
1879    urb_offset = 0;
1880    flush = false;
1881    for (slot = 0; slot < vue_map->num_slots; slot++) {
1882       fs_reg reg, src, zero;
1883
1884       int varying = vue_map->slot_to_varying[slot];
1885       switch (varying) {
1886       case VARYING_SLOT_PSIZ:
1887
1888          /* The point size varying slot is the vue header and is always in the
1889           * vue map.  But often none of the special varyings that live there
1890           * are written and in that case we can skip writing to the vue
1891           * header, provided the corresponding state properly clamps the
1892           * values further down the pipeline. */
1893          if ((vue_map->slots_valid & psiz_mask) == 0) {
1894             assert(length == 0);
1895             urb_offset++;
1896             break;
1897          }
1898
1899          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1900          emit(MOV(zero, fs_reg(0u)));
1901
1902          sources[length++] = zero;
1903          if (vue_map->slots_valid & VARYING_BIT_LAYER)
1904             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
1905          else
1906             sources[length++] = zero;
1907
1908          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
1909             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
1910          else
1911             sources[length++] = zero;
1912
1913          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
1914             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
1915          else
1916             sources[length++] = zero;
1917          break;
1918
1919       case BRW_VARYING_SLOT_NDC:
1920       case VARYING_SLOT_EDGE:
1921          unreachable("unexpected scalar vs output");
1922          break;
1923
1924       case BRW_VARYING_SLOT_PAD:
1925          break;
1926
1927       default:
1928          /* gl_Position is always in the vue map, but isn't always written by
1929           * the shader.  Other varyings (clip distances) get added to the vue
1930           * map but don't always get written.  In those cases, the
1931           * corresponding this->output[] slot will be invalid we and can skip
1932           * the urb write for the varying.  If we've already queued up a vue
1933           * slot for writing we flush a mlen 5 urb write, otherwise we just
1934           * advance the urb_offset.
1935           */
1936          if (this->outputs[varying].file == BAD_FILE) {
1937             if (length > 0)
1938                flush = true;
1939             else
1940                urb_offset++;
1941             break;
1942          }
1943
1944          if ((varying == VARYING_SLOT_COL0 ||
1945               varying == VARYING_SLOT_COL1 ||
1946               varying == VARYING_SLOT_BFC0 ||
1947               varying == VARYING_SLOT_BFC1) &&
1948              key->clamp_vertex_color) {
1949             /* We need to clamp these guys, so do a saturating MOV into a
1950              * temp register and use that for the payload.
1951              */
1952             for (int i = 0; i < 4; i++) {
1953                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
1954                src = offset(this->outputs[varying], i);
1955                fs_inst *inst = emit(MOV(reg, src));
1956                inst->saturate = true;
1957                sources[length++] = reg;
1958             }
1959          } else {
1960             for (int i = 0; i < 4; i++)
1961                sources[length++] = offset(this->outputs[varying], i);
1962          }
1963          break;
1964       }
1965
1966       current_annotation = "URB write";
1967
1968       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
1969        * the last slot or if we need to flush (see BAD_FILE varying case
1970        * above), emit a URB write send now to flush out the data.
1971        */
1972       int last = slot == vue_map->num_slots - 1;
1973       if (length == 8 || last)
1974          flush = true;
1975       if (flush) {
1976          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
1977          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
1978                                  BRW_REGISTER_TYPE_F, dispatch_width);
1979          payload_sources[0] =
1980             fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1981
1982          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
1983          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
1984
1985          fs_inst *inst =
1986             emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1987          inst->eot = last;
1988          inst->mlen = length + 1;
1989          inst->offset = urb_offset;
1990          urb_offset = slot + 1;
1991          length = 0;
1992          flush = false;
1993       }
1994    }
1995 }
1996
1997 void
1998 fs_visitor::resolve_ud_negate(fs_reg *reg)
1999 {
2000    if (reg->type != BRW_REGISTER_TYPE_UD ||
2001        !reg->negate)
2002       return;
2003
2004    fs_reg temp = vgrf(glsl_type::uint_type);
2005    emit(MOV(temp, *reg));
2006    *reg = temp;
2007 }
2008
2009 void
2010 fs_visitor::emit_cs_terminate()
2011 {
2012    assert(brw->gen >= 7);
2013
2014    /* We are getting the thread ID from the compute shader header */
2015    assert(stage == MESA_SHADER_COMPUTE);
2016
2017    /* We can't directly send from g0, since sends with EOT have to use
2018     * g112-127. So, copy it to a virtual register, The register allocator will
2019     * make sure it uses the appropriate register range.
2020     */
2021    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
2022    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
2023    fs_inst *inst = emit(MOV(payload, g0));
2024    inst->force_writemask_all = true;
2025
2026    /* Send a message to the thread spawner to terminate the thread. */
2027    inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
2028    inst->eot = true;
2029 }
2030
2031 fs_visitor::fs_visitor(struct brw_context *brw,
2032                        void *mem_ctx,
2033                        gl_shader_stage stage,
2034                        const void *key,
2035                        struct brw_stage_prog_data *prog_data,
2036                        struct gl_shader_program *shader_prog,
2037                        struct gl_program *prog,
2038                        unsigned dispatch_width)
2039    : backend_shader(brw, shader_prog, prog, prog_data, stage),
2040      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
2041      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
2042      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
2043      key(key), prog_data(prog_data),
2044      dispatch_width(dispatch_width), promoted_constants(0),
2045      bld(fs_builder(this, dispatch_width).at_end())
2046 {
2047    this->mem_ctx = mem_ctx;
2048
2049    switch (stage) {
2050    case MESA_SHADER_FRAGMENT:
2051       key_tex = &((const brw_wm_prog_key *) key)->tex;
2052       break;
2053    case MESA_SHADER_VERTEX:
2054    case MESA_SHADER_GEOMETRY:
2055       key_tex = &((const brw_vue_prog_key *) key)->tex;
2056       break;
2057    case MESA_SHADER_COMPUTE:
2058       key_tex = &((const brw_cs_prog_key*) key)->tex;
2059       break;
2060    default:
2061       unreachable("unhandled shader stage");
2062    }
2063
2064    this->failed = false;
2065    this->simd16_unsupported = false;
2066    this->no16_msg = NULL;
2067
2068    this->nir_locals = NULL;
2069    this->nir_globals = NULL;
2070
2071    memset(&this->payload, 0, sizeof(this->payload));
2072    memset(this->outputs, 0, sizeof(this->outputs));
2073    memset(this->output_components, 0, sizeof(this->output_components));
2074    this->source_depth_to_render_target = false;
2075    this->runtime_check_aads_emit = false;
2076    this->first_non_payload_grf = 0;
2077    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2078
2079    this->current_annotation = NULL;
2080    this->base_ir = NULL;
2081
2082    this->virtual_grf_start = NULL;
2083    this->virtual_grf_end = NULL;
2084    this->live_intervals = NULL;
2085    this->regs_live_at_ip = NULL;
2086
2087    this->uniforms = 0;
2088    this->last_scratch = 0;
2089    this->pull_constant_loc = NULL;
2090    this->push_constant_loc = NULL;
2091
2092    this->spilled_any_registers = false;
2093    this->do_dual_src = false;
2094
2095    if (dispatch_width == 8)
2096       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
2097 }
2098
2099 fs_visitor::~fs_visitor()
2100 {
2101 }