src/mesa/drivers/dri/i965/gen8_ps_state.c

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <stdbool.h>
  25 #include "program/program.h"
  26 #include "brw_state.h"
  27 #include "brw_defines.h"
  28 #include "brw_wm.h"
  29 #include "intel_batchbuffer.h"
  30
  31 void
  32 gen8_upload_ps_extra(struct brw_context *brw,
  33                      const struct gl_fragment_program *fp,
  34                      const struct brw_wm_prog_data *prog_data,
  35                      bool multisampled_fbo)
  36 {
  37    struct gl_context *ctx = &brw->ctx;
  38    uint32_t dw1 = 0;
  39
  40    dw1 |= GEN8_PSX_PIXEL_SHADER_VALID;
  41    dw1 |= prog_data->computed_depth_mode << GEN8_PSX_COMPUTED_DEPTH_MODE_SHIFT;
  42
  43    if (prog_data->uses_kill)
  44       dw1 |= GEN8_PSX_KILL_ENABLE;
  45
  46    if (prog_data->num_varying_inputs != 0)
  47       dw1 |= GEN8_PSX_ATTRIBUTE_ENABLE;
  48
  49    if (prog_data->uses_src_depth)
  50       dw1 |= GEN8_PSX_USES_SOURCE_DEPTH;
  51
  52    if (prog_data->uses_src_w)
  53       dw1 |= GEN8_PSX_USES_SOURCE_W;
  54
  55    if (multisampled_fbo &&
  56        _mesa_get_min_invocations_per_fragment(ctx, fp, false) > 1)
  57       dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE;
  58
  59    if (prog_data->uses_sample_mask) {
  60       if (brw->gen >= 9)
  61          dw1 |= BRW_PSICMS_INNER << GEN9_PSX_SHADER_NORMAL_COVERAGE_MASK_SHIFT;
  62       else
  63          dw1 |= GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK;
  64    }
  65
  66    if (prog_data->uses_omask)
  67       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
  68
  69    if (brw->gen >= 9 && prog_data->pulls_bary)
  70       dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
  71
  72    /* The stricter cross-primitive coherency guarantees that the hardware
  73     * gives us with the "Accesses UAV" bit set for at least one shader stage
  74     * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are
  75     * redundant within the current image, atomic counter and SSBO GL APIs,
  76     * which all have very loose ordering and coherency requirements and
  77     * generally rely on the application to insert explicit barriers when a
  78     * shader invocation is expected to see the memory writes performed by the
  79     * invocations of some previous primitive.  Regardless of the value of "UAV
  80     * coherency required", the "Accesses UAV" bits will implicitly cause an in
  81     * most cases useless DC flush when the lowermost stage with the bit set
  82     * finishes execution.
  83     *
  84     * It would be nice to disable it, but in some cases we can't because on
  85     * Gen8+ it also has an influence on rasterization via the PS UAV-only
  86     * signal (which could be set independently from the coherency mechanism in
  87     * the 3DSTATE_WM command on Gen7), and because in some cases it will
  88     * determine whether the hardware skips execution of the fragment shader or
  89     * not via the ThreadDispatchEnable signal.  However if we know that
  90     * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
  91     * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
  92     * difference so we may just disable it here.
  93     *
  94     * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
  95     * take into account KillPixels when no depth or stencil writes are enabled.
  96     * In order for occlusion queries to work correctly with no attachments, we
  97     * need to force-enable here.
  98     *
  99     * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
 100     */
 101    if ((_mesa_active_fragment_shader_has_side_effects(ctx) ||
 102         prog_data->uses_kill) && !brw_color_buffer_write_enabled(brw))
 103       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 104
 105    if (prog_data->computed_stencil) {
 106       assert(brw->gen >= 9);
 107       dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL;
 108    }
 109
 110    BEGIN_BATCH(2);
 111    OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
 112    OUT_BATCH(dw1);
 113    ADVANCE_BATCH();
 114 }
 115
 116 static void
 117 upload_ps_extra(struct brw_context *brw)
 118 {
 119    /* BRW_NEW_FRAGMENT_PROGRAM */
 120    const struct brw_fragment_program *fp =
 121       brw_fragment_program_const(brw->fragment_program);
 122    /* BRW_NEW_FS_PROG_DATA */
 123    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
 124    /* BRW_NEW_NUM_SAMPLES */
 125    const bool multisampled_fbo = brw->num_samples > 1;
 126
 127    gen8_upload_ps_extra(brw, &fp->program, prog_data, multisampled_fbo);
 128 }
 129
 130 const struct brw_tracked_state gen8_ps_extra = {
 131    .dirty = {
 132       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
 133       .brw   = BRW_NEW_BLORP |
 134                BRW_NEW_CONTEXT |
 135                BRW_NEW_FRAGMENT_PROGRAM |
 136                BRW_NEW_FS_PROG_DATA |
 137                BRW_NEW_NUM_SAMPLES,
 138    },
 139    .emit = upload_ps_extra,
 140 };
 141
 142 static void
 143 upload_wm_state(struct brw_context *brw)
 144 {
 145    struct gl_context *ctx = &brw->ctx;
 146    uint32_t dw1 = 0;
 147
 148    dw1 |= GEN7_WM_STATISTICS_ENABLE;
 149    dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
 150    dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
 151    dw1 |= GEN7_WM_POINT_RASTRULE_UPPER_RIGHT;
 152
 153    /* _NEW_LINE */
 154    if (ctx->Line.StippleFlag)
 155       dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
 156
 157    /* _NEW_POLYGON */
 158    if (ctx->Polygon.StippleFlag)
 159       dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
 160
 161    /* BRW_NEW_FS_PROG_DATA */
 162    dw1 |= brw->wm.prog_data->barycentric_interp_modes <<
 163       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
 164
 165    /* BRW_NEW_FS_PROG_DATA */
 166    if (brw->wm.prog_data->early_fragment_tests)
 167       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
 168    else if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx))
 169       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 170
 171    BEGIN_BATCH(2);
 172    OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2));
 173    OUT_BATCH(dw1);
 174    ADVANCE_BATCH();
 175 }
 176
 177 const struct brw_tracked_state gen8_wm_state = {
 178    .dirty = {
 179       .mesa  = _NEW_LINE |
 180                _NEW_POLYGON,
 181       .brw   = BRW_NEW_BLORP |
 182                BRW_NEW_CONTEXT |
 183                BRW_NEW_FS_PROG_DATA,
 184    },
 185    .emit = upload_wm_state,
 186 };
 187
 188 void
 189 gen8_upload_ps_state(struct brw_context *brw,
 190                      const struct gl_fragment_program *fp,
 191                      const struct brw_stage_state *stage_state,
 192                      const struct brw_wm_prog_data *prog_data,
 193                      uint32_t fast_clear_op)
 194 {
 195    struct gl_context *ctx = &brw->ctx;
 196    uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;
 197
 198    /* Initialize the execution mask with VMask.  Otherwise, derivatives are
 199     * incorrect for subspans where some of the pixels are unlit.  We believe
 200     * the bit just didn't take effect in previous generations.
 201     */
 202    dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;
 203
 204    const unsigned sampler_count =
 205       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
 206    dw3 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT);
 207
 208    /* BRW_NEW_FS_PROG_DATA */
 209    dw3 |=
 210       ((prog_data->base.binding_table.size_bytes / 4) <<
 211        GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 212
 213    if (prog_data->base.use_alt_mode)
 214       dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
 215
 216    /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
 217     * it implicitly scales for different GT levels (which have some # of PSDs).
 218     *
 219     * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
 220     */
 221    if (brw->gen >= 9)
 222       dw6 |= (64 - 1) << HSW_PS_MAX_THREADS_SHIFT;
 223    else
 224       dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT;
 225
 226    if (prog_data->base.nr_params > 0)
 227       dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
 228
 229    /* From the documentation for this packet:
 230     * "If the PS kernel does not need the Position XY Offsets to
 231     *  compute a Position Value, then this field should be programmed
 232     *  to POSOFFSET_NONE."
 233     *
 234     * "SW Recommendation: If the PS kernel needs the Position Offsets
 235     *  to compute a Position XY value, this field should match Position
 236     *  ZW Interpolation Mode to ensure a consistent position.xyzw
 237     *  computation."
 238     *
 239     * We only require XY sample offsets. So, this recommendation doesn't
 240     * look useful at the moment. We might need this in future.
 241     */
 242    if (prog_data->uses_pos_offset)
 243       dw6 |= GEN7_PS_POSOFFSET_SAMPLE;
 244    else
 245       dw6 |= GEN7_PS_POSOFFSET_NONE;
 246
 247    dw6 |= fast_clear_op;
 248
 249    /* _NEW_MULTISAMPLE
 250     * In case of non 1x per sample shading, only one of SIMD8 and SIMD16
 251     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
 252     * is successfully compiled. In majority of the cases that bring us
 253     * better performance than 'SIMD8 only' dispatch.
 254     */
 255    int min_invocations_per_fragment =
 256       _mesa_get_min_invocations_per_fragment(ctx, fp, false);
 257    assert(min_invocations_per_fragment >= 1);
 258
 259    if (prog_data->prog_offset_16 || prog_data->no_8) {
 260       dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
 261       if (!prog_data->no_8 && min_invocations_per_fragment == 1) {
 262          dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
 263          dw7 |= (prog_data->base.dispatch_grf_start_reg <<
 264                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 265          dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
 266                  GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
 267          ksp0 = stage_state->prog_offset;
 268          ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
 269       } else {
 270          dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
 271                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 272
 273          ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
 274       }
 275    } else {
 276       dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
 277       dw7 |= (prog_data->base.dispatch_grf_start_reg <<
 278               GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 279       ksp0 = stage_state->prog_offset;
 280    }
 281
 282    BEGIN_BATCH(12);
 283    OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
 284    OUT_BATCH(ksp0);
 285    OUT_BATCH(0);
 286    OUT_BATCH(dw3);
 287    if (prog_data->base.total_scratch) {
 288       OUT_RELOC64(stage_state->scratch_bo,
 289                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 290                   ffs(prog_data->base.total_scratch) - 11);
 291    } else {
 292       OUT_BATCH(0);
 293       OUT_BATCH(0);
 294    }
 295    OUT_BATCH(dw6);
 296    OUT_BATCH(dw7);
 297    OUT_BATCH(0); /* kernel 1 pointer */
 298    OUT_BATCH(0);
 299    OUT_BATCH(ksp2);
 300    OUT_BATCH(0);
 301    ADVANCE_BATCH();
 302 }
 303
 304 static void
 305 upload_ps_state(struct brw_context *brw)
 306 {
 307    /* BRW_NEW_FS_PROG_DATA */
 308    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
 309    gen8_upload_ps_state(brw, brw->fragment_program, &brw->wm.base, prog_data,
 310                         brw->wm.fast_clear_op);
 311 }
 312
 313 const struct brw_tracked_state gen8_ps_state = {
 314    .dirty = {
 315       .mesa  = _NEW_MULTISAMPLE,
 316       .brw   = BRW_NEW_BATCH |
 317                BRW_NEW_BLORP |
 318                BRW_NEW_FRAGMENT_PROGRAM |
 319                BRW_NEW_FS_PROG_DATA,
 320    },
 321    .emit = upload_ps_state,
 322 };