src/mesa/drivers/dri/i965/gen8_ps_state.c

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <stdbool.h>
  25 #include "program/program.h"
  26 #include "brw_state.h"
  27 #include "brw_defines.h"
  28 #include "intel_batchbuffer.h"
  29
  30 static void
  31 upload_ps_extra(struct brw_context *brw)
  32 {
  33    struct gl_context *ctx = &brw->ctx;
  34    /* BRW_NEW_FRAGMENT_PROGRAM */
  35    const struct brw_fragment_program *fp =
  36       brw_fragment_program_const(brw->fragment_program);
  37    uint32_t dw1 = 0;
  38
  39    dw1 |= GEN8_PSX_PIXEL_SHADER_VALID;
  40
  41    if (fp->program.UsesKill)
  42       dw1 |= GEN8_PSX_KILL_ENABLE;
  43
  44    /* BRW_NEW_FRAGMENT_PROGRAM */
  45    if (brw->wm.prog_data->num_varying_inputs != 0)
  46       dw1 |= GEN8_PSX_ATTRIBUTE_ENABLE;
  47
  48    if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
  49       switch (fp->program.FragDepthLayout) {
  50          case FRAG_DEPTH_LAYOUT_NONE:
  51          case FRAG_DEPTH_LAYOUT_ANY:
  52             dw1 |= GEN8_PSX_PSCDEPTH_ON;
  53             break;
  54          case FRAG_DEPTH_LAYOUT_GREATER:
  55             dw1 |= GEN8_PSX_PSCDEPTH_ON_GE;
  56             break;
  57          case FRAG_DEPTH_LAYOUT_LESS:
  58             dw1 |= GEN8_PSX_PSCDEPTH_ON_LE;
  59             break;
  60          case FRAG_DEPTH_LAYOUT_UNCHANGED:
  61             break;
  62       }
  63    }
  64
  65    if (fp->program.Base.InputsRead & VARYING_BIT_POS)
  66       dw1 |= GEN8_PSX_USES_SOURCE_DEPTH | GEN8_PSX_USES_SOURCE_W;
  67
  68    /* BRW_NEW_NUM_SAMPLES | _NEW_MULTISAMPLE */
  69    bool multisampled_fbo = brw->num_samples > 1;
  70    if (multisampled_fbo &&
  71        _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1)
  72       dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE;
  73
  74    if (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN)
  75       dw1 |= GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK;
  76
  77    if (brw->wm.prog_data->uses_omask)
  78       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
  79
  80    BEGIN_BATCH(2);
  81    OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
  82    OUT_BATCH(dw1);
  83    ADVANCE_BATCH();
  84 }
  85
  86 const struct brw_tracked_state gen8_ps_extra = {
  87    .dirty = {
  88       .mesa  = _NEW_MULTISAMPLE,
  89       .brw   = BRW_NEW_CONTEXT | BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_NUM_SAMPLES,
  90       .cache = 0,
  91    },
  92    .emit = upload_ps_extra,
  93 };
  94
  95 static void
  96 upload_wm_state(struct brw_context *brw)
  97 {
  98    struct gl_context *ctx = &brw->ctx;
  99    uint32_t dw1 = 0;
 100
 101    dw1 |= GEN7_WM_STATISTICS_ENABLE;
 102    dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
 103    dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
 104    dw1 |= GEN7_WM_POINT_RASTRULE_UPPER_RIGHT;
 105
 106    /* _NEW_LINE */
 107    if (ctx->Line.StippleFlag)
 108       dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE;
 109
 110    /* _NEW_POLYGON */
 111    if (ctx->Polygon.StippleFlag)
 112       dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE;
 113
 114    /* CACHE_NEW_WM_PROG */
 115    dw1 |= brw->wm.prog_data->barycentric_interp_modes <<
 116       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
 117
 118    BEGIN_BATCH(2);
 119    OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2));
 120    OUT_BATCH(dw1);
 121    ADVANCE_BATCH();
 122 }
 123
 124 const struct brw_tracked_state gen8_wm_state = {
 125    .dirty = {
 126       .mesa  = _NEW_LINE | _NEW_POLYGON,
 127       .brw   = BRW_NEW_CONTEXT,
 128       .cache = CACHE_NEW_WM_PROG,
 129    },
 130    .emit = upload_wm_state,
 131 };
 132
 133 static void
 134 upload_ps_state(struct brw_context *brw)
 135 {
 136    struct gl_context *ctx = &brw->ctx;
 137    uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;
 138
 139    /* Initialize the execution mask with VMask.  Otherwise, derivatives are
 140     * incorrect for subspans where some of the pixels are unlit.  We believe
 141     * the bit just didn't take effect in previous generations.
 142     */
 143    dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;
 144
 145    dw3 |=
 146       (ALIGN(brw->wm.base.sampler_count, 4) / 4) << GEN7_PS_SAMPLER_COUNT_SHIFT;
 147
 148    /* CACHE_NEW_WM_PROG */
 149    dw3 |=
 150       ((brw->wm.prog_data->base.binding_table.size_bytes / 4) <<
 151        GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 152
 153    /* Use ALT floating point mode for ARB fragment programs, because they
 154     * require 0^0 == 1.  Even though _CurrentFragmentProgram is used for
 155     * rendering, CurrentFragmentProgram is used for this check to
 156     * differentiate between the GLSL and non-GLSL cases.
 157     */
 158    if (ctx->Shader.CurrentProgram[MESA_SHADER_FRAGMENT] == NULL)
 159       dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT;
 160
 161    /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
 162     * it implicitly scales for different GT levels (which have some # of PSDs).
 163     */
 164    dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT;
 165
 166    /* CACHE_NEW_WM_PROG */
 167    if (brw->wm.prog_data->base.nr_params > 0)
 168       dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
 169
 170    /* From the documentation for this packet:
 171     * "If the PS kernel does not need the Position XY Offsets to
 172     *  compute a Position Value, then this field should be programmed
 173     *  to POSOFFSET_NONE."
 174     *
 175     * "SW Recommendation: If the PS kernel needs the Position Offsets
 176     *  to compute a Position XY value, this field should match Position
 177     *  ZW Interpolation Mode to ensure a consistent position.xyzw
 178     *  computation."
 179     *
 180     * We only require XY sample offsets. So, this recommendation doesn't
 181     * look useful at the moment. We might need this in future.
 182     */
 183    if (brw->wm.prog_data->uses_pos_offset)
 184       dw6 |= GEN7_PS_POSOFFSET_SAMPLE;
 185    else
 186       dw6 |= GEN7_PS_POSOFFSET_NONE;
 187
 188    dw6 |= brw->wm.fast_clear_op;
 189
 190    /* _NEW_MULTISAMPLE
 191     * In case of non 1x per sample shading, only one of SIMD8 and SIMD16
 192     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
 193     * is successfully compiled. In majority of the cases that bring us
 194     * better performance than 'SIMD8 only' dispatch.
 195     */
 196    int min_invocations_per_fragment =
 197       _mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false);
 198    assert(min_invocations_per_fragment >= 1);
 199
 200    if (brw->wm.prog_data->prog_offset_16 || brw->wm.prog_data->no_8) {
 201       dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
 202       if (!brw->wm.prog_data->no_8 && min_invocations_per_fragment == 1) {
 203          dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
 204          dw7 |= (brw->wm.prog_data->base.dispatch_grf_start_reg <<
 205                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 206          dw7 |= (brw->wm.prog_data->dispatch_grf_start_reg_16 <<
 207                  GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
 208          ksp0 = brw->wm.base.prog_offset;
 209          ksp2 = brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16;
 210       } else {
 211          dw7 |= (brw->wm.prog_data->dispatch_grf_start_reg_16 <<
 212                  GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 213
 214          ksp0 = brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16;
 215       }
 216    } else {
 217       dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
 218       dw7 |= (brw->wm.prog_data->base.dispatch_grf_start_reg <<
 219               GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
 220       ksp0 = brw->wm.base.prog_offset;
 221    }
 222
 223    BEGIN_BATCH(12);
 224    OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
 225    OUT_BATCH(ksp0);
 226    OUT_BATCH(0);
 227    OUT_BATCH(dw3);
 228    if (brw->wm.prog_data->base.total_scratch) {
 229       OUT_RELOC64(brw->wm.base.scratch_bo,
 230                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
 231                   ffs(brw->wm.prog_data->base.total_scratch) - 11);
 232    } else {
 233       OUT_BATCH(0);
 234       OUT_BATCH(0);
 235    }
 236    OUT_BATCH(dw6);
 237    OUT_BATCH(dw7);
 238    OUT_BATCH(0); /* kernel 1 pointer */
 239    OUT_BATCH(0);
 240    OUT_BATCH(ksp2);
 241    OUT_BATCH(0);
 242    ADVANCE_BATCH();
 243 }
 244
 245 const struct brw_tracked_state gen8_ps_state = {
 246    .dirty = {
 247       .mesa  = _NEW_MULTISAMPLE,
 248       .brw   = BRW_NEW_FRAGMENT_PROGRAM |
 249                BRW_NEW_BATCH,
 250       .cache = CACHE_NEW_WM_PROG
 251    },
 252    .emit = upload_ps_state,
 253 };