From acc77947ca623472f9c7464553496314b57654ac Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Tue, 8 Dec 2015 18:53:22 +0200 Subject: [PATCH] i965: Implement L3 state atom. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The L3 state atom calculates the target L3 partition weights when the program bound to some shader stage is modified, and in case they are far enough from the current partitioning it makes sure that the L3 state is re-emitted. v2: Fix for inconsistent units the context URB size is expressed in. Clamp URB size to 1008 KB on SKL due to FF hardware limitation. Reviewed-by: Samuel Iglesias Gonsálvez Acked-by: Kenneth Graunke Reviewed-by: Kristian Høgsberg --- src/mesa/drivers/dri/i965/brw_context.h | 6 ++ src/mesa/drivers/dri/i965/brw_state.h | 1 + src/mesa/drivers/dri/i965/gen7_l3_state.c | 81 +++++++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 99ac7ca135d..f8784bc2a70 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -702,6 +702,8 @@ enum brw_predicate_state { struct shader_times; +struct brw_l3_config; + /** * brw_context is derived from gl_context. */ @@ -1267,6 +1269,10 @@ struct brw_context uint32_t num_instances; int basevertex; + struct { + const struct brw_l3_config *config; + } l3; + struct { drm_intel_bo *bo; const char **names; diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 28bea0dbfd0..44537aac614 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -130,6 +130,7 @@ extern const struct brw_tracked_state gen7_clip_state; extern const struct brw_tracked_state gen7_ds_state; extern const struct brw_tracked_state gen7_gs_state; extern const struct brw_tracked_state gen7_hs_state; +extern const struct brw_tracked_state gen7_l3_state; extern const struct brw_tracked_state gen7_ps_state; extern const struct brw_tracked_state gen7_push_constant_space; extern const struct brw_tracked_state gen7_sbe_state; diff --git a/src/mesa/drivers/dri/i965/gen7_l3_state.c b/src/mesa/drivers/dri/i965/gen7_l3_state.c index 557b744b5e8..cacd1aa80fe 100644 --- a/src/mesa/drivers/dri/i965/gen7_l3_state.c +++ b/src/mesa/drivers/dri/i965/gen7_l3_state.c @@ -422,3 +422,84 @@ setup_l3_config(struct brw_context *brw, const struct brw_l3_config *cfg) } } } + +/** + * Return the unit brw_context::urb::size is expressed in, in KB. \sa + * brw_device_info::urb::size. + */ +static unsigned +get_urb_size_scale(const struct brw_device_info *devinfo) +{ + return (devinfo->gen >= 8 ? devinfo->num_slices : 1); +} + +/** + * Update the URB size in the context state for the specified L3 + * configuration. + */ +static void +update_urb_size(struct brw_context *brw, const struct brw_l3_config *cfg) +{ + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; + /* From the SKL "L3 Allocation and Programming" documentation: + * + * "URB is limited to 1008KB due to programming restrictions. This is not + * a restriction of the L3 implementation, but of the FF and other clients. + * Therefore, in a GT4 implementation it is possible for the programmed + * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but + * only 1008KB of this will be used." + */ + const unsigned max = (devinfo->gen == 9 ? 1008 : ~0); + const unsigned sz = + MIN2(max, cfg->n[L3P_URB] * get_l3_way_size(devinfo)) / + get_urb_size_scale(devinfo); + + if (brw->urb.size != sz) { + brw->urb.size = sz; + brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE; + } +} + +static void +emit_l3_state(struct brw_context *brw) +{ + const struct brw_l3_weights w = get_pipeline_state_l3_weights(brw); + const float dw = diff_l3_weights(w, get_config_l3_weights(brw->l3.config)); + /* The distance between any two compatible weight vectors cannot exceed two + * due to the triangle inequality. + */ + const float large_dw_threshold = 2.0; + /* Somewhat arbitrary, simply makes sure that there will be no repeated + * transitions to the same L3 configuration, could probably do better here. + */ + const float small_dw_threshold = 0.5; + /* If we're emitting a new batch the caches should already be clean and the + * transition should be relatively cheap, so it shouldn't hurt much to use + * the smaller threshold. Otherwise use the larger threshold so that we + * only reprogram the L3 mid-batch if the most recently programmed + * configuration is incompatible with the current pipeline state. + */ + const float dw_threshold = (brw->ctx.NewDriverState & BRW_NEW_BATCH ? + small_dw_threshold : large_dw_threshold); + + if (dw > dw_threshold && brw->can_do_pipelined_register_writes) { + const struct brw_l3_config *const cfg = + get_l3_config(brw->intelScreen->devinfo, w); + + setup_l3_config(brw, cfg); + update_urb_size(brw, cfg); + brw->l3.config = cfg; + } +} + +const struct brw_tracked_state gen7_l3_state = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH | + BRW_NEW_CS_PROG_DATA | + BRW_NEW_FS_PROG_DATA | + BRW_NEW_GS_PROG_DATA | + BRW_NEW_VS_PROG_DATA, + }, + .emit = emit_l3_state +}; -- 2.30.2