From 23f69dfc0f5eabc04141e9a2793cf871f38d6432 Mon Sep 17 00:00:00 2001 From: Rafael Antognolli Date: Tue, 21 Mar 2017 13:08:05 -0700 Subject: [PATCH] i965: Port gen7+ 3DSTATE_PS to genxml. Emit 3DSTATE_PS on Gen7+ using brw_batch_emit helper, that uses pack structs from genxml. v2: - Use render_bo helper to setup brw_address (Kristian) v3: - Style fixes and code cleanup (Ken) v4: - More style fixes and code cleanup missed in v3 Signed-off-by: Rafael Antognolli Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_state.h | 2 - src/mesa/drivers/dri/i965/gen7_wm_state.c | 137 ------------------ src/mesa/drivers/dri/i965/gen8_ps_state.c | 114 --------------- src/mesa/drivers/dri/i965/genX_state_upload.c | 136 ++++++++++++++++- 4 files changed, 134 insertions(+), 255 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 94f758b3d36..c55c1752a34 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -132,7 +132,6 @@ extern const struct brw_tracked_state gen7_gs_state; extern const struct brw_tracked_state gen7_tcs_push_constants; extern const struct brw_tracked_state gen7_hs_state; extern const struct brw_tracked_state gen7_l3_state; -extern const struct brw_tracked_state gen7_ps_state; extern const struct brw_tracked_state gen7_push_constant_space; extern const struct brw_tracked_state gen7_sf_clip_viewport; extern const struct brw_tracked_state gen7_te_state; @@ -150,7 +149,6 @@ extern const struct brw_tracked_state gen8_multisample_state; extern const struct brw_tracked_state gen8_pma_fix; extern const struct brw_tracked_state gen8_ps_blend; extern const struct brw_tracked_state gen8_ps_extra; -extern const struct brw_tracked_state gen8_ps_state; extern const struct brw_tracked_state gen8_wm_state; extern const struct brw_tracked_state gen8_sf_clip_viewport; extern const struct brw_tracked_state gen8_vertices; diff --git 
a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 5efe55a0088..3173035f1c3 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -145,140 +145,3 @@ const struct brw_tracked_state gen7_wm_state = { }, .emit = upload_wm_state, }; - -static void -gen7_upload_ps_state(struct brw_context *brw, - const struct brw_stage_state *stage_state, - const struct brw_wm_prog_data *prog_data, - bool enable_dual_src_blend, unsigned sample_mask, - unsigned fast_clear_op) -{ - const struct gen_device_info *devinfo = &brw->screen->devinfo; - uint32_t dw2, dw4, dw5, ksp0, ksp2; - const int max_threads_shift = brw->is_haswell ? - HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT; - - dw2 = dw4 = dw5 = ksp2 = 0; - - const unsigned sampler_count = - DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); - dw2 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT); - - dw2 |= ((prog_data->base.binding_table.size_bytes / 4) << - GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT); - - if (prog_data->base.use_alt_mode) - dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT; - - /* Haswell requires the sample mask to be set in this packet as well as - * in 3DSTATE_SAMPLE_MASK; the values should match. */ - /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ - if (brw->is_haswell) - dw4 |= SET_FIELD(sample_mask, HSW_PS_SAMPLE_MASK); - - dw4 |= (devinfo->max_wm_threads - 1) << max_threads_shift; - - if (prog_data->base.nr_params > 0) - dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE; - - /* From the IVB PRM, volume 2 part 1, page 287: - * "This bit is inserted in the PS payload header and made available to - * the DataPort (either via the message header or via header bypass) to - * indicate that oMask data (one or two phases) is included in Render - * Target Write messages. If present, the oMask data is used to mask off - * samples." 
- */ - if (prog_data->uses_omask) - dw4 |= GEN7_PS_OMASK_TO_RENDER_TARGET; - - /* From the IVB PRM, volume 2 part 1, page 287: - * "If the PS kernel does not need the Position XY Offsets to - * compute a Position Value, then this field should be programmed - * to POSOFFSET_NONE." - * "SW Recommendation: If the PS kernel needs the Position Offsets - * to compute a Position XY value, this field should match Position - * ZW Interpolation Mode to ensure a consistent position.xyzw - * computation." - * We only require XY sample offsets. So, this recommendation doesn't - * look useful at the moment. We might need this in future. - */ - if (prog_data->uses_pos_offset) - dw4 |= GEN7_PS_POSOFFSET_SAMPLE; - else - dw4 |= GEN7_PS_POSOFFSET_NONE; - - /* The hardware wedges if you have this bit set but don't turn on any dual - * source blend factors. - */ - if (enable_dual_src_blend) - dw4 |= GEN7_PS_DUAL_SOURCE_BLEND_ENABLE; - - /* BRW_NEW_FS_PROG_DATA */ - if (prog_data->num_varying_inputs != 0) - dw4 |= GEN7_PS_ATTRIBUTE_ENABLE; - - dw4 |= fast_clear_op; - - if (prog_data->dispatch_16) - dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - - if (prog_data->dispatch_8) - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - - dw5 |= prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0; - dw5 |= prog_data->dispatch_grf_start_reg_2 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2; - - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; - - BEGIN_BATCH(8); - OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2)); - OUT_BATCH(ksp0); - OUT_BATCH(dw2); - if (prog_data->base.total_scratch) { - OUT_RELOC(brw->wm.base.scratch_bo, - I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(stage_state->per_thread_scratch) - 11); - } else { - OUT_BATCH(0); - } - OUT_BATCH(dw4); - OUT_BATCH(dw5); - OUT_BATCH(0); /* kernel 1 pointer */ - OUT_BATCH(ksp2); - ADVANCE_BATCH(); -} - -static void -upload_ps_state(struct brw_context *brw) -{ - /* BRW_NEW_FS_PROG_DATA */ - const struct 
brw_wm_prog_data *prog_data = - brw_wm_prog_data(brw->wm.base.prog_data); - const struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_FS_PROG_DATA | _NEW_COLOR */ - const bool enable_dual_src_blend = prog_data->dual_src_blend && - (ctx->Color.BlendEnabled & 1) && - ctx->Color.Blend[0]._UsesDualSrc; - /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ - const unsigned sample_mask = - brw->is_haswell ? gen6_determine_sample_mask(brw) : 0; - - gen7_upload_ps_state(brw, &brw->wm.base, prog_data, - enable_dual_src_blend, sample_mask, - brw->wm.fast_clear_op); -} - -const struct brw_tracked_state gen7_ps_state = { - .dirty = { - .mesa = _NEW_BUFFERS | - _NEW_COLOR | - _NEW_MULTISAMPLE, - .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP | - BRW_NEW_FS_PROG_DATA, - }, - .emit = upload_ps_state, -}; diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 03468267ce6..581aa54c904 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -185,117 +185,3 @@ const struct brw_tracked_state gen8_wm_state = { }, .emit = upload_wm_state, }; - -void -gen8_upload_ps_state(struct brw_context *brw, - const struct brw_stage_state *stage_state, - const struct brw_wm_prog_data *prog_data, - uint32_t fast_clear_op) -{ - uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0; - - /* Initialize the execution mask with VMask. Otherwise, derivatives are - * incorrect for subspans where some of the pixels are unlit. We believe - * the bit just didn't take effect in previous generations. 
- */ - dw3 |= GEN7_PS_VECTOR_MASK_ENABLE; - - const unsigned sampler_count = - DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); - dw3 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT); - - /* BRW_NEW_FS_PROG_DATA */ - dw3 |= - ((prog_data->base.binding_table.size_bytes / 4) << - GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT); - - if (prog_data->base.use_alt_mode) - dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT; - - /* 3DSTATE_PS expects the number of threads per PSD, which is always 64; - * it implicitly scales for different GT levels (which have some # of PSDs). - * - * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1. - */ - if (brw->gen >= 9) - dw6 |= (64 - 1) << HSW_PS_MAX_THREADS_SHIFT; - else - dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT; - - if (prog_data->base.nr_params > 0) - dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE; - - /* From the documentation for this packet: - * "If the PS kernel does not need the Position XY Offsets to - * compute a Position Value, then this field should be programmed - * to POSOFFSET_NONE." - * - * "SW Recommendation: If the PS kernel needs the Position Offsets - * to compute a Position XY value, this field should match Position - * ZW Interpolation Mode to ensure a consistent position.xyzw - * computation." - * - * We only require XY sample offsets. So, this recommendation doesn't - * look useful at the moment. We might need this in future. 
- */ - if (prog_data->uses_pos_offset) - dw6 |= GEN7_PS_POSOFFSET_SAMPLE; - else - dw6 |= GEN7_PS_POSOFFSET_NONE; - - dw6 |= fast_clear_op; - - if (prog_data->dispatch_8) - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - - if (prog_data->dispatch_16) - dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - - dw7 |= prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0; - dw7 |= prog_data->dispatch_grf_start_reg_2 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2; - - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; - - BEGIN_BATCH(12); - OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); - OUT_BATCH(ksp0); - OUT_BATCH(0); - OUT_BATCH(dw3); - if (prog_data->base.total_scratch) { - OUT_RELOC64(stage_state->scratch_bo, - I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(stage_state->per_thread_scratch) - 11); - } else { - OUT_BATCH(0); - OUT_BATCH(0); - } - OUT_BATCH(dw6); - OUT_BATCH(dw7); - OUT_BATCH(0); /* kernel 1 pointer */ - OUT_BATCH(0); - OUT_BATCH(ksp2); - OUT_BATCH(0); - ADVANCE_BATCH(); -} - -static void -upload_ps_state(struct brw_context *brw) -{ - /* BRW_NEW_FS_PROG_DATA */ - const struct brw_wm_prog_data *prog_data = - brw_wm_prog_data(brw->wm.base.prog_data); - gen8_upload_ps_state(brw, &brw->wm.base, prog_data, brw->wm.fast_clear_op); -} - -const struct brw_tracked_state gen8_ps_state = { - .dirty = { - .mesa = _NEW_MULTISAMPLE, - .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP | - BRW_NEW_FS_PROG_DATA, - }, - .emit = upload_ps_state, -}; diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index c06132cec6e..09537c3e584 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -1195,6 +1195,138 @@ static const struct brw_tracked_state genX(sol_state) = { .emit = genX(upload_sol), }; +/* ---------------------------------------------------------------------- */ +/* genX(upload_ps): emits 3DSTATE_PS through brw_batch_emit() using brw->wm.base as the stage state and its prog_data (read via brw_wm_prog_data()). Generation-specific fields are chosen at compile time by the GEN_GEN and GEN_IS_HASWELL preprocessor checks below. */ +static void +genX(upload_ps)(struct brw_context *brw) +{ + UNUSED
const struct gl_context *ctx = &brw->ctx; + UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; + + /* BRW_NEW_FS_PROG_DATA */ + const struct brw_wm_prog_data *prog_data = + brw_wm_prog_data(brw->wm.base.prog_data); + const struct brw_stage_state *stage_state = &brw->wm.base; + +#if GEN_GEN < 8 /* NOTE(review): this #if and #endif pair encloses nothing -- looks like a leftover from the port; candidate for removal. */ +#endif + + brw_batch_emit(brw, GENX(3DSTATE_PS), ps) { + /* Initialize the execution mask with VMask. Otherwise, derivatives are + * incorrect for subspans where some of the pixels are unlit. We believe + * the bit just didn't take effect in previous generations. + */ + ps.VectorMaskEnable = GEN_GEN >= 8; + + ps.SamplerCount = + DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); + + /* BRW_NEW_FS_PROG_DATA */ + ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4; + + if (prog_data->base.use_alt_mode) + ps.FloatingPointMode = Alternate; + + /* Haswell requires the sample mask to be set in this packet as well as + * in 3DSTATE_SAMPLE_MASK; the values should match. + */ + + /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ +#if GEN_IS_HASWELL + ps.SampleMask = gen6_determine_sample_mask(brw); +#endif + + /* 3DSTATE_PS expects the number of threads per PSD, which is always 64; + * it implicitly scales for different GT levels (which have some # of + * PSDs). + * + * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1. + */ +#if GEN_GEN >= 9 + ps.MaximumNumberofThreadsPerPSD = 64 - 1; +#elif GEN_GEN >= 8 + ps.MaximumNumberofThreadsPerPSD = 64 - 2; +#else + ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; +#endif + + if (prog_data->base.nr_params > 0) + ps.PushConstantEnable = true; + +#if GEN_GEN < 8 + /* From the IVB PRM, volume 2 part 1, page 287: + * "This bit is inserted in the PS payload header and made available to + * the DataPort (either via the message header or via header bypass) to + * indicate that oMask data (one or two phases) is included in Render + * Target Write messages.
If present, the oMask data is used to mask off + * samples." + */ + ps.oMaskPresenttoRenderTarget = prog_data->uses_omask; + + /* The hardware wedges if you have this bit set but don't turn on any + * dual source blend factors. + * + * BRW_NEW_FS_PROG_DATA | _NEW_COLOR + */ + ps.DualSourceBlendEnable = prog_data->dual_src_blend && + (ctx->Color.BlendEnabled & 1) && + ctx->Color.Blend[0]._UsesDualSrc; + + /* BRW_NEW_FS_PROG_DATA */ + ps.AttributeEnable = (prog_data->num_varying_inputs != 0); +#endif + + /* From the documentation for this packet: + * "If the PS kernel does not need the Position XY Offsets to + * compute a Position Value, then this field should be programmed + * to POSOFFSET_NONE." + * + * "SW Recommendation: If the PS kernel needs the Position Offsets + * to compute a Position XY value, this field should match Position + * ZW Interpolation Mode to ensure a consistent position.xyzw + * computation." + * + * We only require XY sample offsets. So, this recommendation doesn't + * look useful at the moment. We might need this in future. + */ + if (prog_data->uses_pos_offset) + ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE; + else + ps.PositionXYOffsetSelect = POSOFFSET_NONE; + + ps.RenderTargetFastClearEnable = brw->wm.fast_clear_op; /* NOTE(review): the gen7 and gen8 code removed by this patch ORed fast_clear_op directly into the packed dword (dw4 and dw6); here the same value is assigned to one named field. Confirm the stored value fits that field encoding. */ + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps.DispatchGRFStartRegisterForConstantSetupData0 = + prog_data->base.dispatch_grf_start_reg; + ps.DispatchGRFStartRegisterForConstantSetupData2 = + prog_data->dispatch_grf_start_reg_2; + + ps.KernelStartPointer0 = stage_state->prog_offset; + ps.KernelStartPointer2 = stage_state->prog_offset + + prog_data->prog_offset_2; + + if (prog_data->base.total_scratch) { + ps.ScratchSpaceBasePointer = + render_bo(stage_state->scratch_bo, + ffs(stage_state->per_thread_scratch) - 11); /* NOTE(review): ffs(1024) - 11 == 0, so this presumably passes log2 of the per-thread scratch size relative to 1KB -- confirm against the packet documentation. */ + } + } +} +/* Tracked-state atom that runs genX(upload_ps). _NEW_BUFFERS and _NEW_COLOR are part of the mesa dirty mask only when GEN_GEN < 8, which lines up with the inputs that are read only on those generations above: the Haswell sample mask and the ctx->Color dual-source-blend check. */ +static const struct brw_tracked_state genX(ps_state) = { + .dirty = { + .mesa = _NEW_MULTISAMPLE | + (GEN_GEN < 8 ?
_NEW_BUFFERS | + _NEW_COLOR + : 0), + .brw = BRW_NEW_BATCH | + BRW_NEW_BLORP | + BRW_NEW_FS_PROG_DATA, + }, + .emit = genX(upload_ps), +}; + #endif /* ---------------------------------------------------------------------- */ @@ -1518,7 +1650,7 @@ genX(init_atoms)(struct brw_context *brw) &genX(sbe_state), &genX(sf_state), &gen7_wm_state, - &gen7_ps_state, + &genX(ps_state), &gen6_scissor_state, @@ -1607,7 +1739,7 @@ genX(init_atoms)(struct brw_context *brw) &genX(sf_state), &gen8_ps_blend, &gen8_ps_extra, - &gen8_ps_state, + &genX(ps_state), &genX(depth_stencil_state), &gen8_wm_state, -- 2.30.2