/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR.  The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/nir/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "program/sampler.h"
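
/**
 * Return a register pointing at the payload slot that holds the given VS
 * system value, recording in prog_data that it is used so the vertex
 * elements get set up accordingly.
 */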
fs_reg *
fs_visitor::emit_vs_system_value(int location)
{
   fs_reg *reg = new(this->mem_ctx)
      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info.inputs_read),
             BRW_REGISTER_TYPE_D);
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;

   switch (location) {
   case SYSTEM_VALUE_BASE_VERTEX:
      reg->reg_offset = 0;
      vs_prog_data->uses_vertexid = true;
      break;
   case SYSTEM_VALUE_VERTEX_ID:
   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
      reg->reg_offset = 2;
      vs_prog_data->uses_vertexid = true;
      break;
   case SYSTEM_VALUE_INSTANCE_ID:
      reg->reg_offset = 3;
      vs_prog_data->uses_instanceid = true;
      break;
   default:
      unreachable("not reached");
   }

   return reg;
}
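
/**
 * Handle GL_TEXTURE_RECTANGLE coordinate normalization and GL_CLAMP for the
 * cases the sampler can't do itself, returning the (possibly rewritten)
 * coordinate.
 */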
fs_reg
fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
                             bool is_rect, uint32_t sampler)
{
   bool needs_gl_clamp = true;
   fs_reg scale_x, scale_y;

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (is_rect &&
       (devinfo->gen < 6 ||
        (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
                               key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
      struct gl_program_parameter_list *params = prog->Parameters;

      /* FINISHME: We're failing to recompile our programs when the sampler is
       *           updated.  This only matters for the texture rectangle scale
       *           parameters (pre-gen6, or gen6+ with GL_CLAMP).
       */
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         prog->SamplerUnits[sampler],
         0,
         0
      };

      no16("rectangle scale uniform setup not supported on SIMD16\n");
      if (dispatch_width == 16) {
         return coordinate;
      }

      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);
      /* Try to find existing copies of the texrect scale uniforms. */
      for (unsigned i = 0; i < uniforms; i++) {
         if (stage_prog_data->param[i] ==
             &prog->Parameters->ParameterValues[index][0]) {
            scale_x = fs_reg(UNIFORM, i);
            scale_y = fs_reg(UNIFORM, i + 1);
            break;
         }
      }

      /* If we didn't already set them up, do so now. */
      if (scale_x.file == BAD_FILE) {
         scale_x = fs_reg(UNIFORM, uniforms);
         scale_y = fs_reg(UNIFORM, uniforms + 1);

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][0];
         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][1];
      }
   }

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (devinfo->gen < 6 && is_rect) {
      fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
      fs_reg src = coordinate;
      coordinate = dst;

      bld.MUL(dst, src, scale_x);
      dst = offset(dst, bld, 1);
      src = offset(src, bld, 1);
      bld.MUL(dst, src, scale_y);
   } else if (is_rect) {
      /* On gen6+, the sampler handles the rectangle coordinates
       * natively, without needing rescaling.  But that means we have
       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
       * not [0, 1] like the default case below.
       */
      needs_gl_clamp = false;

      for (int i = 0; i < 2; i++) {
         if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan = offset(chan, bld, i);

            set_condmod(BRW_CONDITIONAL_GE,
                        bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));

            /* Our parameter comes in as 1.0/width or 1.0/height,
             * because that's what people normally want for doing
             * texture rectangle handling.  We need width or height
             * for clamping, but we don't care enough to make a new
             * parameter type, so just invert back.
             */
            fs_reg limit = vgrf(glsl_type::float_type);
            bld.MOV(limit, i == 0 ? scale_x : scale_y);
            bld.emit(SHADER_OPCODE_RCP, limit, limit);

            set_condmod(BRW_CONDITIONAL_L,
                        bld.emit(BRW_OPCODE_SEL, chan, chan, limit));
         }
      }
   }

   if (coord_components > 0 && needs_gl_clamp) {
      for (int i = 0; i < MIN2(coord_components, 3); i++) {
         if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan = offset(chan, bld, i);
            set_saturate(true, bld.MOV(chan, chan));
         }
      }
   }
   return coordinate;
}

/* Sample from the MCS surface attached to this multisample texture. */
fs_reg
fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
                           const fs_reg &sampler)
{
   const fs_reg dest = vgrf(glsl_type::uvec4_type);
   const fs_reg srcs[] = {
      coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
      sampler, fs_reg(), fs_reg(components), fs_reg(0)
   };
   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                            ARRAY_SIZE(srcs));

   /* We only care about one reg of response, but the sampler always writes
    * 4/8.
    */
   inst->regs_written = 4 * dispatch_width / 8;

   return dest;
}
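
/**
 * Emit the LIR for a texturing operation: build the source list for the
 * logical sampler opcode, then apply the post-sampling fixups (gather
 * workarounds, cube map array depth fixup, result swizzling).
 */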
void
fs_visitor::emit_texture(ir_texture_opcode op,
                         const glsl_type *dest_type,
                         fs_reg coordinate, int coord_components,
                         fs_reg shadow_c,
                         fs_reg lod, fs_reg lod2, int grad_components,
                         fs_reg sample_index,
                         fs_reg offset_value,
                         fs_reg mcs,
                         int gather_component,
                         bool is_cube_array,
                         bool is_rect,
                         uint32_t sampler,
                         fs_reg sampler_reg)
{
   fs_inst *inst = NULL;

   if (op == ir_tg4) {
      /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
       * emitting anything other than setting up the constant result.
       */
      int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
         fs_reg res = vgrf(glsl_type::vec4_type);
         this->result = res;

         for (int i = 0; i < 4; i++) {
            bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
            res = offset(res, bld, 1);
         }
         return;
      }
   }

   if (op == ir_query_levels) {
      /* textureQueryLevels() is implemented in terms of TXS so we need to
       * pass a valid LOD argument.
       */
      assert(lod.file == BAD_FILE);
      lod = fs_reg(0u);
   }

   if (coordinate.file != BAD_FILE) {
      /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
       * samplers.  This should only be a problem with GL_CLAMP on Gen7.
       */
      coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
                                    sampler);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
   const fs_reg srcs[] = {
      coordinate, shadow_c, lod, lod2,
      sample_index, mcs, sampler_reg, offset_value,
      fs_reg(coord_components), fs_reg(grad_components)
   };
   enum opcode opcode;

   switch (op) {
   case ir_tex:
      opcode = SHADER_OPCODE_TEX_LOGICAL;
      break;
   case ir_txb:
      opcode = FS_OPCODE_TXB_LOGICAL;
      break;
   case ir_txl:
      opcode = SHADER_OPCODE_TXL_LOGICAL;
      break;
   case ir_txd:
      opcode = SHADER_OPCODE_TXD_LOGICAL;
      break;
   case ir_txf:
      opcode = SHADER_OPCODE_TXF_LOGICAL;
      break;
   case ir_txf_ms:
      opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
      break;
   case ir_txs:
   case ir_query_levels:
      opcode = SHADER_OPCODE_TXS_LOGICAL;
      break;
   case ir_lod:
      opcode = SHADER_OPCODE_LOD_LOGICAL;
      break;
   case ir_tg4:
      opcode = (offset_value.file != BAD_FILE && offset_value.file != IMM ?
                SHADER_OPCODE_TG4_OFFSET_LOGICAL : SHADER_OPCODE_TG4_LOGICAL);
      break;
   default:
      unreachable("Invalid texture opcode.");
   }

   inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
   inst->regs_written = 4 * dispatch_width / 8;

   if (shadow_c.file != BAD_FILE)
      inst->shadow_compare = true;

   if (offset_value.file == IMM)
      inst->offset = offset_value.fixed_hw_reg.dw1.ud;

   if (op == ir_tg4) {
      inst->offset |=
         gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */

      if (devinfo->gen == 6)
         emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
   }

   /* fixup #layers for cube map arrays */
   if (op == ir_txs && is_cube_array) {
      fs_reg depth = offset(dst, bld, 2);
      fs_reg fixed_depth = vgrf(glsl_type::int_type);
      bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));

      fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
      int components = inst->regs_written / (inst->exec_size / 8);
      for (int i = 0; i < components; i++) {
         if (i == 2) {
            fixed_payload[i] = fixed_depth;
         } else {
            fixed_payload[i] = offset(dst, bld, i);
         }
      }
      bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
   }

   swizzle_result(op, dest_type->vector_elements, dst, sampler);
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;

   for (int i = 0; i < 4; i++) {
      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
      /* Convert from UNORM to UINT */
      bld.MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)));
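      /* E.g. for an 8-bit channel: a texel byte of 0x80 comes back from the
       * sampler as the UNORM float 128/255.0f, so multiplying by 255
       * recovers the integer value 128 before it is converted and
       * sign-extended below.
       */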
      bld.MOV(dst, dst_f);

      if (wa & WA_SIGN) {
         /* Reinterpret the UINT value as a signed INT value by
          * shifting the sign bit into place, then shifting back
          * again.
          */
         bld.SHL(dst, dst, fs_reg(32 - width));
         bld.ASR(dst, dst, fs_reg(32 - width));
      }

      dst = offset(dst, bld, 1);
   }
}

/**
 * Set up the gather channel based on the swizzle, for gather4.
 */
uint32_t
fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
{
   int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
   switch (swiz) {
      case SWIZZLE_X: return 0;
      case SWIZZLE_Y:
         /* gather4 sampler is broken for green channel on RG32F --
          * we must ask for blue instead.
          */
         if (key_tex->gather_channel_quirk_mask & (1 << sampler))
            return 2;
         return 1;
      case SWIZZLE_Z: return 2;
      case SWIZZLE_W: return 3;
      default:
         unreachable("Not reached"); /* zero, one swizzles handled already */
   }
}

/**
 * Swizzle the result of a texture lookup.  This is necessary for
 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
 */
void
fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
                           fs_reg orig_val, uint32_t sampler)
{
   if (op == ir_query_levels) {
      /* # levels is in .w */
      this->result = offset(orig_val, bld, 3);
      return;
   }

   this->result = orig_val;

   /* txs,lod don't actually sample the texture, so swizzling the result
    * makes no sense.
    */
   if (op == ir_txs || op == ir_lod || op == ir_tg4)
      return;

   if (dest_components == 1) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
   } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
      fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
      swizzled_result.type = orig_val.type;

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
         fs_reg l = swizzled_result;
         l = offset(l, bld, i);

         if (swiz == SWIZZLE_ZERO) {
            bld.MOV(l, fs_reg(0.0f));
         } else if (swiz == SWIZZLE_ONE) {
            bld.MOV(l, fs_reg(1.0f));
         } else {
            bld.MOV(l, offset(orig_val, bld,
                              GET_SWZ(key_tex->swizzles[sampler], i)));
         }
      }
      this->result = swizzled_result;
   }
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   int reg_width = dispatch_width / 8;

   /* Everyone's favorite color. */
   const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
   for (int i = 0; i < 4; i++) {
      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
              fs_reg(color[i]));
   }

   fs_inst *write;
   write = bld.emit(FS_OPCODE_FB_WRITE);
   write->eot = true;
   if (devinfo->gen >= 6) {
      write->base_mrf = 2;
      write->mlen = 4 * reg_width;
   } else {
      write->header_size = 2;
      write->base_mrf = 0;
      write->mlen = 2 + 4 * reg_width;
   }

   /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
    * varying to avoid GPU hangs, so set that.
    */
   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
   wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
   memset(wm_prog_data->urb_setup, -1,
          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   /* We don't have any uniforms. */
   stage_prog_data->nr_params = 0;
   stage_prog_data->nr_pull_params = 0;
   stage_prog_data->curb_read_length = 0;
   stage_prog_data->dispatch_grf_start_reg = 2;
   wm_prog_data->dispatch_grf_start_reg_16 = 2;
   grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */

   calculate_cfg();
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data *) this->prog_data;
   int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(prog_data->urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   fs_builder abld = bld.annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_type::uint_type);
   this->pixel_y = vgrf(glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
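   /* In the gen4 PS payload, g1 holds the screen X/Y of the origin of each
    * 2x2 subspan, and the <2,4,0> regions below read each origin four times.
    * brw_imm_v() packs eight 4-bit nibbles into a single vector immediate,
    * so adding <0,1,0,1,...> (0x10101010) to the X origins and
    * <0,0,1,1,...> (0x11001100) to the Y origins yields the coordinates of
    * the individual pixels in each subspan.
    */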
   abld.ADD(this->pixel_x,
            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            fs_reg(brw_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            fs_reg(brw_imm_v(0x11001100)));

   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
      vgrf(glsl_type::vec2_type);
   const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));

   if (devinfo->has_pln && dispatch_width == 16) {
      for (unsigned i = 0; i < 2; i++) {
         abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
                          half(this->pixel_x, i), xstart);
         abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
                          half(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_type::float_type);
   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
             interp_reg(VARYING_SLOT_POS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_type::float_type);
   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   fs_builder abld = bld.annotate("compute pixel centers");
   if (devinfo->gen >= 8 || dispatch_width == 8) {
      /* The "Register Region Restrictions" page says for BDW (and newer,
       * presumably):
       *
       *     "When destination spans two registers, the source may be one or
       *      two registers. The destination elements must be evenly split
       *      between the two registers."
       *
       * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
       * compute our pixel centers.
       */
      fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
                          BRW_REGISTER_TYPE_UW);

      const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
      dbld.ADD(int_pixel_xy,
               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
               fs_reg(brw_imm_v(0x11001010)));

      this->pixel_x = vgrf(glsl_type::float_type);
      this->pixel_y = vgrf(glsl_type::float_type);
      abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
      abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
   } else {
      /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
       *
       *     "When destination spans two registers, the source MUST span two
       *      registers."
       *
       * Since the GRF source of the ADD will only read a single register, we
       * must do two separate ADDs in SIMD16.
       */
      fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
      fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
      int_pixel_x.type = BRW_REGISTER_TYPE_UW;
      int_pixel_y.type = BRW_REGISTER_TYPE_UW;
      abld.ADD(int_pixel_x,
               fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
               fs_reg(brw_imm_v(0x10101010)));
      abld.ADD(int_pixel_y,
               fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
               fs_reg(brw_imm_v(0x11001100)));

      /* As of gen6, we can no longer mix float and int sources.  We have
       * to turn the integer pixel centers into floats for their actual
       * use.
       */
      this->pixel_x = vgrf(glsl_type::float_type);
      this->pixel_y = vgrf(glsl_type::float_type);
      abld.MOV(this->pixel_x, int_pixel_x);
      abld.MOV(this->pixel_y, int_pixel_y);
   }

   abld = bld.annotate("compute pos.w");
   this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
   this->wpos_w = vgrf(glsl_type::float_type);
   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);

   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      uint8_t reg = payload.barycentric_coord_reg[i];
      this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
   }
}
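
/**
 * Map a GL alpha-test comparison function onto the EU conditional modifier
 * that implements it.
 */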
static enum brw_conditional_mod
cond_for_alpha_func(GLenum func)
{
   switch(func) {
      case GL_GREATER:
         return BRW_CONDITIONAL_G;
      case GL_GEQUAL:
         return BRW_CONDITIONAL_GE;
      case GL_LESS:
         return BRW_CONDITIONAL_L;
      case GL_LEQUAL:
         return BRW_CONDITIONAL_LE;
      case GL_EQUAL:
         return BRW_CONDITIONAL_EQ;
      case GL_NOTEQUAL:
         return BRW_CONDITIONAL_NEQ;
      default:
         unreachable("Not reached");
   }
}

/**
 * Alpha test support for when we compile it into the shader instead
 * of using the normal fixed-function alpha test.
 */
void
fs_visitor::emit_alpha_test()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;
   const fs_builder abld = bld.annotate("Alpha test");

   fs_inst *cmp;
   if (key->alpha_test_func == GL_ALWAYS)
      return;

   if (key->alpha_test_func == GL_NEVER) {
      /* f0.1 = 0 */
      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                      BRW_REGISTER_TYPE_UW));
      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
                     BRW_CONDITIONAL_NEQ);
   } else {
      /* RT0 alpha */
      fs_reg color = offset(outputs[0], bld, 3);

      /* f0.1 &= func(color, ref) */
      cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
                     cond_for_alpha_func(key->alpha_test_func));
   }
   cmp->predicate = BRW_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;
}
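
/**
 * Emit a single logical framebuffer write, gathering up the color, depth
 * and sample mask sources that the FB write message needs.
 */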
fs_inst *
fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                 fs_reg color0, fs_reg color1,
                                 fs_reg src0_alpha, unsigned components)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data *) this->prog_data;

   /* Hand over gl_FragDepth or the payload depth. */
   const fs_reg dst_depth = (payload.dest_depth_reg ?
                             fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) :
                             fs_reg());
   fs_reg src_depth;

   if (source_depth_to_render_target) {
      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         src_depth = frag_depth;
      else
         src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
   }

   const fs_reg sources[] = {
      color0, color1, src0_alpha, src_depth, dst_depth, sample_mask,
      fs_reg(components)
   };
   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
                             sources, ARRAY_SIZE(sources));

   if (prog_data->uses_kill) {
      write->predicate = BRW_PREDICATE_NORMAL;
      write->flag_subreg = 1;
   }

   return write;
}
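
/**
 * Emit the FB write messages for all color regions written by the shader
 * (or a single write to the null render target when none are), marking the
 * last write with EOT.
 */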
void
fs_visitor::emit_fb_writes()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data *) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;

   fs_inst *inst = NULL;

   if (source_depth_to_render_target && devinfo->gen == 6) {
      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
       * would require SIMD8 moves of each half to message regs, e.g. by using
       * the SIMD lowering pass.  Unfortunately this is more difficult than it
       * sounds because the SIMD8 single-source message lacks channel selects
       * for the second and third subspans.
       */
      no16("Missing support for simd16 depth writes on gen6\n");
   }

   if (do_dual_src) {
      const fs_builder abld = bld.annotate("FB dual-source write");

      inst = emit_single_fb_write(abld, this->outputs[0],
                                  this->dual_src_output, reg_undef, 4);
      inst->target = 0;
      prog_data->dual_src_blend = true;
   } else {
      for (int target = 0; target < key->nr_color_regions; target++) {
         /* Skip over outputs that weren't written. */
         if (this->outputs[target].file == BAD_FILE)
            continue;

         const fs_builder abld = bld.annotate(
            ralloc_asprintf(this->mem_ctx, "FB write target %d", target));

         fs_reg src0_alpha;
         if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
            src0_alpha = offset(outputs[0], bld, 3);

         inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                     src0_alpha,
                                     this->output_components[target]);
         inst->target = target;
      }
   }

   if (inst == NULL) {
      /* Even if there's no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      /* FINISHME: Factor out this frequently recurring pattern into a
       * helper function.
       */
      const fs_reg srcs[] = { reg_undef, reg_undef,
                              reg_undef, offset(this->outputs[0], bld, 3) };
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);

      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
      inst->target = 0;
   }

   inst->eot = true;
}
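
/**
 * Upload the user clip plane coefficients as push constants, one vec4 of
 * uniforms per enabled clip plane, and point userplane[] at them.
 */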
void
fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
{
   const struct brw_vs_prog_key *key =
      (const struct brw_vs_prog_key *) this->key;

   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
      this->userplane[i] = fs_reg(UNIFORM, uniforms);
      for (int j = 0; j < 4; ++j) {
         stage_prog_data->param[uniforms + j] =
            (gl_constant_value *) &clip_planes[i][j];
      }
      uniforms += 4;
   }
}

/**
 * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances.
 *
 * This does nothing if the shader uses gl_ClipDistance or user clipping is
 * disabled altogether.
 */
void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
{
   struct brw_vue_prog_data *vue_prog_data =
      (struct brw_vue_prog_data *) prog_data;
   const struct brw_vs_prog_key *key =
      (const struct brw_vs_prog_key *) this->key;

   /* Bail unless some sort of legacy clipping is enabled */
   if (key->nr_userclip_plane_consts == 0)
      return;

   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *      static write to gl_ClipVertex or gl_ClipDistance, but the
    *      application has requested clipping against user clip planes through
    *      the API, then the coordinate written to gl_Position is used for
    *      comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */

   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
      clip_vertex = VARYING_SLOT_POS;

   /* If the clip vertex isn't written, skip this.  Typically this means
    * the GS will set up clipping. */
   if (outputs[clip_vertex].file == BAD_FILE)
      return;

   setup_uniform_clipplane_values(clip_planes);

   const fs_builder abld = bld.annotate("user clip distances");

   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
   this->output_components[VARYING_SLOT_CLIP_DIST0] = 4;
   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
   this->output_components[VARYING_SLOT_CLIP_DIST1] = 4;

   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
      fs_reg u = userplane[i];
      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
      output.reg_offset = i & 3;

      abld.MUL(output, outputs[clip_vertex], u);
      for (int j = 1; j < 4; j++) {
         u.reg = userplane[i].reg + j;
         abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u);
      }
   }
}
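
/**
 * Emit the URB writes that store the VS outputs: walk the VUE map,
 * accumulating up to 8 registers of payload, and flush each batch with a
 * SHADER_OPCODE_URB_WRITE_SIMD8 send; the final send is marked EOT.
 */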
void
fs_visitor::emit_urb_writes()
{
   int slot, urb_offset, length;
   struct brw_vs_prog_data *vs_prog_data =
      (struct brw_vs_prog_data *) prog_data;
   const struct brw_vs_prog_key *key =
      (const struct brw_vs_prog_key *) this->key;
   const GLbitfield64 psiz_mask =
      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
   bool flush;
   fs_reg sources[8];

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader.  This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (vue_map->slots_valid == 0) {
      fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
      bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
                                                BRW_REGISTER_TYPE_UD)));

      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
      inst->eot = true;
      inst->mlen = 2;
      inst->offset = 1;
      return;
   }

   length = 0;
   urb_offset = 0;
   flush = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      fs_reg reg, src, zero;

      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ:
         /* The point size varying slot is the vue header and is always in the
          * vue map.  But often none of the special varyings that live there
          * are written and in that case we can skip writing to the vue
          * header, provided the corresponding state properly clamps the
          * values further down the pipeline. */
         if ((vue_map->slots_valid & psiz_mask) == 0) {
            assert(length == 0);
            urb_offset++;
            break;
         }

         zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
         bld.MOV(zero, fs_reg(0u));

         sources[length++] = zero;
         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;

      case BRW_VARYING_SLOT_NDC:
      case VARYING_SLOT_EDGE:
         unreachable("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader.  Other varyings (clip distances) get added to the vue
          * map but don't always get written.  In those cases, the
          * corresponding this->output[] slot will be invalid and we can skip
          * the urb write for the varying.  If we've already queued up a vue
          * slot for writing we flush a mlen 5 urb write, otherwise we just
          * advance the urb_offset.
          */
         if (varying == BRW_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         if ((varying == VARYING_SLOT_COL0 ||
              varying == VARYING_SLOT_COL1 ||
              varying == VARYING_SLOT_BFC0 ||
              varying == VARYING_SLOT_BFC1) &&
             key->clamp_vertex_color) {
            /* We need to clamp these guys, so do a saturating MOV into a
             * temp register and use that for the payload.
             */
            for (int i = 0; i < 4; i++) {
               reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
               src = offset(this->outputs[varying], bld, i);
               set_saturate(true, bld.MOV(reg, src));
               sources[length++] = reg;
            }
         } else {
            for (unsigned i = 0; i < output_components[varying]; i++)
               sources[length++] = offset(this->outputs[varying], bld, i);
            for (unsigned i = output_components[varying]; i < 4; i++)
               sources[length++] = fs_reg(0);
         }
         break;
      }

      const fs_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
       * the last slot or if we need to flush (see BAD_FILE varying case
       * above), emit a URB write send now to flush out the data.
       */
      int last = slot == vue_map->num_slots - 1;
      if (length == 8 || last)
         flush = true;
      if (flush) {
         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
         fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
                                 BRW_REGISTER_TYPE_F);
         payload_sources[0] =
            fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));

         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
         abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);

         fs_inst *inst =
            abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
         inst->eot = last;
         inst->mlen = length + 1;
         inst->offset = urb_offset;
         urb_offset = slot + 1;
         length = 0;
         flush = false;
      }
   }
}
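
/**
 * Terminate a compute shader thread: copy g0 into a register the EOT send
 * is allowed to use and hand it back to the thread spawner.
 */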
void
fs_visitor::emit_cs_terminate()
{
   assert(devinfo->gen >= 7);

   /* We are getting the thread ID from the compute shader header */
   assert(stage == MESA_SHADER_COMPUTE);

   /* We can't directly send from g0, since sends with EOT have to use
    * g112-127.  So, copy it to a virtual register; the register allocator
    * will make sure it uses the appropriate register range.
    */
   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
   bld.group(8, 0).exec_all().MOV(payload, g0);

   /* Send a message to the thread spawner to terminate the thread. */
   fs_inst *inst = bld.exec_all()
                      .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
   inst->eot = true;
}
void
fs_visitor::emit_barrier()
{
   assert(devinfo->gen >= 7);

   /* We are getting the barrier ID from the compute shader header */
   assert(stage == MESA_SHADER_COMPUTE);

   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);

   const fs_builder pbld = bld.exec_all().group(8, 0);

   /* Clear the message payload */
   pbld.MOV(payload, fs_reg(0u));

   /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
   fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
   pbld.AND(component(payload, 2), r0_2, fs_reg(0x0f000000u));

   /* Emit a gateway "barrier" message using the payload we set up, followed
    * by a wait instruction.
    */
   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
}
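
/**
 * Construct an fs_visitor for the given dispatch width: pick the texture
 * portion of the stage-specific program key and reset the per-compile
 * bookkeeping fields.
 */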
fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                       void *mem_ctx,
                       const void *key,
                       struct brw_stage_prog_data *prog_data,
                       struct gl_program *prog,
                       nir_shader *shader,
                       unsigned dispatch_width,
                       int shader_time_index)
   : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
     key(key), prog_data(prog_data), prog(prog),
     dispatch_width(dispatch_width),
     shader_time_index(shader_time_index),
     promoted_constants(0),
     bld(fs_builder(this, dispatch_width).at_end())
{
   switch (stage) {
   case MESA_SHADER_FRAGMENT:
      key_tex = &((const brw_wm_prog_key *) key)->tex;
      break;
   case MESA_SHADER_VERTEX:
      key_tex = &((const brw_vs_prog_key *) key)->tex;
      break;
   case MESA_SHADER_GEOMETRY:
      key_tex = &((const brw_gs_prog_key *) key)->tex;
      break;
   case MESA_SHADER_COMPUTE:
      key_tex = &((const brw_cs_prog_key *) key)->tex;
      break;
   default:
      unreachable("unhandled shader stage");
   }

   this->failed = false;
   this->simd16_unsupported = false;
   this->no16_msg = NULL;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;

   memset(&this->payload, 0, sizeof(this->payload));
   memset(this->outputs, 0, sizeof(this->outputs));
   memset(this->output_components, 0, sizeof(this->output_components));
   this->source_depth_to_render_target = false;
   this->runtime_check_aads_emit = false;
   this->first_non_payload_grf = 0;
   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;
   this->regs_live_at_ip = NULL;

   this->uniforms = 0;
   this->last_scratch = 0;
   this->pull_constant_loc = NULL;
   this->push_constant_loc = NULL;

   this->spilled_any_registers = false;
   this->do_dual_src = false;

   if (dispatch_width == 8)
      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
}

fs_visitor::~fs_visitor()
{
}