src/mesa/drivers/dri/i965/gen7_wm_state.c

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <stdbool.h>
  25 #include "brw_context.h"
  26 #include "brw_state.h"
  27 #include "brw_defines.h"
  28 #include "brw_util.h"
  29 #include "brw_wm.h"
  30 #include "program/program.h"
  31 #include "program/prog_parameter.h"
  32 #include "program/prog_statevars.h"
  33 #include "intel_batchbuffer.h"
  34
  35 static void
  36 upload_wm_state(struct brw_context *brw)
  37 {
  38    struct gl_context *ctx = &brw->ctx;
  39    const struct brw_fragment_program *fp =
  40       brw_fragment_program_const(brw->fragment_program);
  41    bool writes_depth = false;
  42    uint32_t dw1, dw2;
  43
  44    /* _NEW_BUFFERS */
  45    bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
  46
  47    dw1 = dw2 = 0;
  48    dw1 |= GEN7_WM_STATISTICS_ENABLE;
  49    dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
  50    dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
  51
  52    /* _NEW_LINE */
  53    if (ctx->Line.StippleFlag)
  54       dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
  55
  56    /* _NEW_POLYGON */
  57    if (ctx->Polygon.StippleFlag)
  58       dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
  59
  60    /* BRW_NEW_FRAGMENT_PROGRAM */
  61    if (fp->program.Base.InputsRead & VARYING_BIT_POS)
  62       dw1 |= GEN7_WM_USES_SOURCE_DEPTH | GEN7_WM_USES_SOURCE_W;
  63    if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
  64       writes_depth = fp->program.FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED;
  65
  66       switch (fp->program.FragDepthLayout) {
  67          case FRAG_DEPTH_LAYOUT_NONE:
  68          case FRAG_DEPTH_LAYOUT_ANY:
  69             dw1 |= GEN7_WM_PSCDEPTH_ON;
  70             break;
  71          case FRAG_DEPTH_LAYOUT_GREATER:
  72             dw1 |= GEN7_WM_PSCDEPTH_ON_GE;
  73             break;
  74          case FRAG_DEPTH_LAYOUT_LESS:
  75             dw1 |= GEN7_WM_PSCDEPTH_ON_LE;
  76             break;
  77          case FRAG_DEPTH_LAYOUT_UNCHANGED:
  78             break;
  79       }
  80    }
  81    /* CACHE_NEW_WM_PROG */
  82    dw1 |= brw->wm.prog_data->barycentric_interp_modes <<
  83       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
  84
  85    /* _NEW_COLOR, _NEW_MULTISAMPLE */
  86    /* Enable if the pixel shader kernel generates and outputs oMask.
  87     */
  88    if (fp->program.UsesKill || ctx->Color.AlphaEnabled ||
  89        ctx->Multisample.SampleAlphaToCoverage ||
  90        brw->wm.prog_data->uses_omask) {
  91       dw1 |= GEN7_WM_KILL_ENABLE;
  92    }
  93
  94    /* _NEW_BUFFERS */
  95    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
  96        dw1 & GEN7_WM_KILL_ENABLE) {
  97       dw1 |= GEN7_WM_DISPATCH_ENABLE;
  98    }
  99    if (multisampled_fbo) {
 100       /* _NEW_MULTISAMPLE */
 101       if (ctx->Multisample.Enabled)
 102          dw1 |= GEN7_WM_MSRAST_ON_PATTERN;
 103       else
 104          dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
 105
 106       if (_mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false) > 1)
 107          dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
 108       else
 109          dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
 110    } else {
 111       dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
 112       dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
 113    }
 114
 115    if (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
 116       dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
 117    }
 118
 119    BEGIN_BATCH(3);
 120    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
 121    OUT_BATCH(dw1);
 122    OUT_BATCH(dw2);
 123    ADVANCE_BATCH();
 124 }
 125
 126 const struct brw_tracked_state gen7_wm_state = {
 127    .dirty = {
 128       .mesa  = (_NEW_LINE | _NEW_POLYGON |
 129                 _NEW_COLOR | _NEW_BUFFERS |
 130                 _NEW_MULTISAMPLE),
 131       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 132                 BRW_NEW_BATCH),
 133       .cache = CACHE_NEW_WM_PROG,
 134    },
 135    .emit = upload_wm_state,
 136 };
 137
 138 static void
 139 upload_ps_state(struct brw_context *brw)
 140 {
 141    struct gl_context *ctx = &brw->ctx;
 142    uint32_t dw2, dw4, dw5;
 143    const int max_threads_shift = brw->is_haswell ?
 144       HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
 145
 146    /* BRW_NEW_PS_BINDING_TABLE */
 147    BEGIN_BATCH(2);
 148    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_PS << 16 | (2 - 2));
 149    OUT_BATCH(brw->wm.base.bind_bo_offset);
 150    ADVANCE_BATCH();
 151
 152    /* CACHE_NEW_SAMPLER */
 153    BEGIN_BATCH(2);
 154    OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_PS << 16 | (2 - 2));
 155    OUT_BATCH(brw->wm.base.sampler_offset);
 156    ADVANCE_BATCH();
 157
 158    /* CACHE_NEW_WM_PROG */
 159    gen7_upload_constant_state(brw, &brw->wm.base, true, _3DSTATE_CONSTANT_PS);
 160
 161    dw2 = dw4 = dw5 = 0;
 162
 163    /* CACHE_NEW_SAMPLER */
 164    dw2 |=
 165       (ALIGN(brw->wm.base.sampler_count, 4) / 4) << GEN7_PS_SAMPLER_COUNT_SHIFT;
 166
 167    /* CACHE_NEW_WM_PROG */
 168    dw2 |= ((brw->wm.prog_data->base.binding_table.size_bytes / 4) <<
 169            GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 170
 171    /* Use ALT floating point mode for ARB fragment programs, because they
 172     * require 0^0 == 1.  Even though _CurrentFragmentProgram is used for
 173     * rendering, CurrentFragmentProgram is used for this check to
 174     * differentiate between the GLSL and non-GLSL cases.
 175     */
 176    /* BRW_NEW_FRAGMENT_PROGRAM */
 177    if (ctx->Shader.CurrentProgram[MESA_SHADER_FRAGMENT] == NULL)
 178       dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
 179
 180    /* Haswell requires the sample mask to be set in this packet as well as
 181     * in 3DSTATE_SAMPLE_MASK; the values should match. */
 182    /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
 183    if (brw->is_haswell)
 184       dw4 |= SET_FIELD(gen6_determine_sample_mask(brw), HSW_PS_SAMPLE_MASK);
 185
 186    dw4 |= (brw->max_wm_threads - 1) << max_threads_shift;
 187
 188    /* CACHE_NEW_WM_PROG */
 189    if (brw->wm.prog_data->nr_params > 0)
 190       dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
 191
 192    /* From the IVB PRM, volume 2 part 1, page 287:
 193     * "This bit is inserted in the PS payload header and made available to
 194     * the DataPort (either via the message header or via header bypass) to
 195     * indicate that oMask data (one or two phases) is included in Render
 196     * Target Write messages. If present, the oMask data is used to mask off
 197     * samples."
 198     */
 199    if (brw->wm.prog_data->uses_omask)
 200       dw4 |= GEN7_PS_OMASK_TO_RENDER_TARGET;
 201
 202    /* From the IVB PRM, volume 2 part 1, page 287:
 203     * "If the PS kernel does not need the Position XY Offsets to
 204     * compute a Position Value, then this field should be programmed
 205     * to POSOFFSET_NONE."
 206     * "SW Recommendation: If the PS kernel needs the Position Offsets
 207     * to compute a Position XY value, this field should match Position
 208     * ZW Interpolation Mode to ensure a consistent position.xyzw
 209     * computation."
 210     * We only require XY sample offsets. So, this recommendation doesn't
 211     * look useful at the moment. We might need this in future.
 212     */
 213    if (brw->wm.prog_data->uses_pos_offset)
 214       dw4 |= GEN7_PS_POSOFFSET_SAMPLE;
 215    else
 216       dw4 |= GEN7_PS_POSOFFSET_NONE;
 217
 218    /* CACHE_NEW_WM_PROG | _NEW_COLOR
 219     *
 220     * The hardware wedges if you have this bit set but don't turn on any dual
 221     * source blend factors.
 222     */
 223    if (brw->wm.prog_data->dual_src_blend &&
 224        (ctx->Color.BlendEnabled & 1) &&
 225        ctx->Color.Blend[0]._UsesDualSrc) {
 226       dw4 |= GEN7_PS_DUAL_SOURCE_BLEND_ENABLE;
 227    }
 228
 229    /* CACHE_NEW_WM_PROG */
 230    if (brw->wm.prog_data->num_varying_inputs != 0)
 231       dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;
 232
 233    /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
 234     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
 235     * is successfully compiled. In majority of the cases that bring us
 236     * better performance than 'SIMD8 only' dispatch.
 237     */
 238    int min_inv_per_frag =
 239       _mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false);
 240    assert(min_inv_per_frag >= 1);
 241
 242    if (brw->wm.prog_data->prog_offset_16) {
 243       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
 244       if (min_inv_per_frag == 1) {
 245          dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
 246          dw5 |= (brw->wm.prog_data->first_curbe_grf <<
 247                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 248          dw5 |= (brw->wm.prog_data->first_curbe_grf_16 <<
 249                  GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
 250       } else
 251          dw5 |= (brw->wm.prog_data->first_curbe_grf_16 <<
 252                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 253    }
 254    else {
 255       dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
 256       dw5 |= (brw->wm.prog_data->first_curbe_grf <<
 257               GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 258    }
 259
 260    BEGIN_BATCH(8);
 261    OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
 262    if (brw->wm.prog_data->prog_offset_16 && min_inv_per_frag > 1)
 263       OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16);
 264    else
 265       OUT_BATCH(brw->wm.base.prog_offset);
 266    OUT_BATCH(dw2);
 267    if (brw->wm.prog_data->total_scratch) {
 268       OUT_RELOC(brw->wm.base.scratch_bo,
 269                 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 270                 ffs(brw->wm.prog_data->total_scratch) - 11);
 271    } else {
 272       OUT_BATCH(0);
 273    }
 274    OUT_BATCH(dw4);
 275    OUT_BATCH(dw5);
 276    OUT_BATCH(0); /* kernel 1 pointer */
 277    OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16);
 278    ADVANCE_BATCH();
 279 }
 280
 281 const struct brw_tracked_state gen7_ps_state = {
 282    .dirty = {
 283       .mesa  = (_NEW_PROGRAM_CONSTANTS |
 284                 _NEW_COLOR |
 285                 _NEW_BUFFERS |
 286                 _NEW_MULTISAMPLE),
 287       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 288                 BRW_NEW_PS_BINDING_TABLE |
 289                 BRW_NEW_BATCH |
 290                 BRW_NEW_PUSH_CONSTANT_ALLOCATION),
 291       .cache = (CACHE_NEW_SAMPLER |
 292                 CACHE_NEW_WM_PROG)
 293    },
 294    .emit = upload_ps_state,
 295 };