From 9d6ca7c3d091e1ab71ce2f75bf4f13dc8844d801 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 26 Sep 2016 10:30:30 -0700
Subject: [PATCH] i965: Only emit 1 viewport when possible.

In core profile, we support up to 16 viewports. However, in the majority
of cases, only 1 of them is actually used; we only need the others if the
last shader stage prior to the rasterizer writes gl_ViewportIndex.

Processing all 16 viewports adds CPU overhead, which hurts CPU-intensive
workloads such as Glamor. This meant that switching to core profile
actually penalized Glamor to an extent, which is unfortunate.

This patch tracks the number of relevant viewports, using
ctx->Const.MaxViewports when gl_ViewportIndex is written and 1 otherwise.
A new BRW_NEW_VIEWPORT_COUNT flag tracks this. This could mean re-emitting
viewport state when switching, but hopefully this is offset by doing
1/16th of the work in the common case. The new flag is also lighter weight
than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case.

According to Eric Anholt, x11perf -copypixwin10 performance improves by
11.5094% +/- 3.10841% (n=10) on his Skylake.

Signed-off-by: Kenneth Graunke Reviewed-by: Ian Romanick Acked-by: Anuj Phogat --- src/mesa/drivers/dri/i965/brw_cc.c | 10 ++++++--- src/mesa/drivers/dri/i965/brw_context.c | 1 + src/mesa/drivers/dri/i965/brw_context.h | 9 ++++++++ src/mesa/drivers/dri/i965/brw_gs_state.c | 6 +++-- src/mesa/drivers/dri/i965/brw_state_upload.c | 11 ++++++++++ src/mesa/drivers/dri/i965/gen6_clip_state.c | 16 ++++++-------- .../drivers/dri/i965/gen6_scissor_state.c | 10 ++++++--- .../drivers/dri/i965/gen6_viewport_state.c | 22 +++++++++++++------ .../drivers/dri/i965/gen7_viewport_state.c | 10 ++++++--- .../drivers/dri/i965/gen8_viewport_state.c | 10 ++++++--- 10 files changed, 75 insertions(+), 30 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c index 5c58b448c1f..b11d7c85ca9 100644 --- a/src/mesa/drivers/dri/i965/brw_cc.c +++ b/src/mesa/drivers/dri/i965/brw_cc.c @@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw) struct gl_context *ctx = &brw->ctx; struct brw_cc_viewport *ccv; + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE, - sizeof(*ccv) * ctx->Const.MaxViewports, 32, + sizeof(*ccv) * viewport_count, 32, &brw->cc.vp_offset); /* _NEW_TRANSFORM */ - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { if (ctx->Transform.DepthClamp) { /* _NEW_VIEWPORT */ ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near, @@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = { .mesa = _NEW_TRANSFORM | _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = brw_upload_cc_vp }; diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 6efad7849d6..b0eec16d826 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -1085,6 +1085,7 @@ 
brwCreateContext(gl_api api, brw->prim_restart.enable_cut_index = false; brw->gs.enabled = false; brw->sf.viewport_transform_enable = true; + brw->clip.viewport_count = 1; brw->predicate.state = BRW_PREDICATE_STATE_RENDER; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 00f0adca4d5..b27fe51e706 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -226,6 +226,7 @@ enum brw_state_id { BRW_STATE_URB_SIZE, BRW_STATE_CC_STATE, BRW_STATE_BLORP, + BRW_STATE_VIEWPORT_COUNT, BRW_NUM_STATE_BITS }; @@ -294,6 +295,7 @@ enum brw_state_id { #define BRW_NEW_PROGRAM_CACHE (1ull << BRW_STATE_PROGRAM_CACHE) #define BRW_NEW_STATE_BASE_ADDRESS (1ull << BRW_STATE_STATE_BASE_ADDRESS) #define BRW_NEW_VUE_MAP_GEOM_OUT (1ull << BRW_STATE_VUE_MAP_GEOM_OUT) +#define BRW_NEW_VIEWPORT_COUNT (1ull << BRW_STATE_VIEWPORT_COUNT) #define BRW_NEW_TRANSFORM_FEEDBACK (1ull << BRW_STATE_TRANSFORM_FEEDBACK) #define BRW_NEW_RASTERIZER_DISCARD (1ull << BRW_STATE_RASTERIZER_DISCARD) #define BRW_NEW_STATS_WM (1ull << BRW_STATE_STATS_WM) @@ -1160,6 +1162,13 @@ struct brw_context * instead of vp_bo. */ uint32_t vp_offset; + + /** + * The number of viewports to use. If gl_ViewportIndex is written, + * we can have up to ctx->Const.MaxViewports viewports. If not, + * the viewport index is always 0, so we can only emit one. 
+ */ + uint8_t viewport_count; } clip; diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c index 1757201675d..8e3bf1ef651 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_state.c +++ b/src/mesa/drivers/dri/i965/brw_gs_state.c @@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw) if (unlikely(INTEL_DEBUG & DEBUG_STATS)) gs->thread4.stats_enable = 1; - gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1; + /* BRW_NEW_VIEWPORT_COUNT */ + gs->gs6.max_vp_index = brw->clip.viewport_count - 1; brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; } @@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = { BRW_NEW_CURBE_OFFSETS | BRW_NEW_FF_GS_PROG_DATA | BRW_NEW_PROGRAM_CACHE | - BRW_NEW_URB_FENCE, + BRW_NEW_URB_FENCE | + BRW_NEW_VIEWPORT_COUNT, }, .emit = brw_upload_gs_unit, }; diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 8ce6851814b..055fed128a0 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_URB_SIZE), DEFINE_BIT(BRW_NEW_CC_STATE), DEFINE_BIT(BRW_NEW_BLORP), + DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT), {0, 0, 0} }; @@ -696,6 +697,8 @@ static inline void brw_upload_programs(struct brw_context *brw, enum brw_pipeline pipeline) { + struct gl_context *ctx = &brw->ctx; + if (pipeline == BRW_RENDER_PIPELINE) { brw_upload_vs_prog(brw); brw_upload_tess_programs(brw); @@ -722,6 +725,14 @@ brw_upload_programs(struct brw_context *brw, old_separate != brw->vue_map_geom_out.separate) brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT; + if ((old_slots ^ brw->vue_map_geom_out.slots_valid) & + VARYING_BIT_VIEWPORT) { + ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT; + brw->clip.viewport_count = + (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ? 
+ ctx->Const.MaxViewports : 1; + } + if (brw->gen < 6) { brw_setup_vue_interpolation(brw); brw_upload_clip_prog(brw); diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c index 7dc97404369..9c33e67c9aa 100644 --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c @@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw) dw2 |= GEN6_CLIP_GB_TEST; + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + /* We need to disable guardband clipping if the guardband (which we always * program to the maximum screen-space bounding box of 8K x 8K) will be * smaller than the viewport. @@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw) * "objects must have a screenspace bounding box not exceeding 8K in the X * or Y direction" restriction. Instead, they're clipped. */ - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { if (ctx->ViewportArray[i].Width > 8192 || ctx->ViewportArray[i].Height > 8192) { dw2 &= ~GEN6_CLIP_GB_TEST; @@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw) const float fb_width = (float)_mesa_geometric_width(fb); const float fb_height = (float)_mesa_geometric_height(fb); - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { if (ctx->ViewportArray[i].X != 0 || ctx->ViewportArray[i].Y != 0 || ctx->ViewportArray[i].Width != fb_width || @@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw) if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw)) dw2 |= GEN6_CLIP_XY_TEST; - /* BRW_NEW_VUE_MAP_GEOM_OUT */ - const int max_vp_index = - (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ? 
- ctx->Const.MaxViewports : 1; - BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2)); OUT_BATCH(dw1); @@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw) OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT | U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT | (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) | - ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK)); + ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK)); ADVANCE_BATCH(); } @@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = { BRW_NEW_PRIMITIVE | BRW_NEW_RASTERIZER_DISCARD | BRW_NEW_TES_PROG_DATA | - BRW_NEW_VUE_MAP_GEOM_OUT, + BRW_NEW_VIEWPORT_COUNT, }, .emit = upload_clip_state, }; diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c index b03ac730a3e..860445a2b43 100644 --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c @@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw) const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer); const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer); + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE, - sizeof(*scissor) * ctx->Const.MaxViewports, 32, + sizeof(*scissor) * viewport_count, 32, &scissor_state_offset); /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */ @@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw) * Note that the hardware's coordinates are inclusive, while Mesa's min is * inclusive but max is exclusive. 
*/ - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { int bbox[4]; bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); @@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = { _NEW_SCISSOR | _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = gen6_upload_scissor_state, }; diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c index eacffb9ba0f..ad1e72d0a50 100644 --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c @@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw) struct gl_context *ctx = &brw->ctx; struct brw_clipper_viewport *vp; + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE, - sizeof(*vp) * ctx->Const.MaxViewports, 32, &brw->clip.vp_offset); + sizeof(*vp) * viewport_count, 32, &brw->clip.vp_offset); - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { /* According to the "Vertex X,Y Clamping and Quantization" section of the * Strips and Fans documentation, objects must not have a screen-space * extents of over 8192 pixels, or they may be mis-rasterized. 
The maximum @@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = { .dirty = { .mesa = _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = gen6_upload_clip_vp, }; @@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw) GLfloat y_scale, y_bias; const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, - sizeof(*sfv) * ctx->Const.MaxViewports, + sizeof(*sfv) * viewport_count, 32, &brw->sf.vp_offset); - memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports); + memset(sfv, 0, sizeof(*sfv) * viewport_count); /* _NEW_BUFFERS */ if (render_to_fbo) { @@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw) y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer); } - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { float scale[3], translate[3]; /* _NEW_VIEWPORT */ @@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = { .mesa = _NEW_BUFFERS | _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = gen6_upload_sf_vp, }; diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c index 34f93afdef6..c447331a2e5 100644 --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c @@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw) const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); struct gen7_sf_clip_viewport *vp; + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, - sizeof(*vp) * ctx->Const.MaxViewports, 64, + sizeof(*vp) * viewport_count, 64, &brw->sf.vp_offset); /* Also assign to clip.vp_offset in case something 
uses it. */ brw->clip.vp_offset = brw->sf.vp_offset; @@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw) y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer); } - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, i, scale, translate); @@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = { .mesa = _NEW_BUFFERS | _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = gen7_upload_sf_clip_viewport, }; diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c index acaee1a94e8..84000e3a7e2 100644 --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c @@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw) const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer); const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); + /* BRW_NEW_VIEWPORT_COUNT */ + const unsigned viewport_count = brw->clip.viewport_count; + float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, - 16 * 4 * ctx->Const.MaxViewports, + 16 * 4 * viewport_count, 64, &brw->sf.vp_offset); /* Also assign to clip.vp_offset in case something uses it. */ brw->clip.vp_offset = brw->sf.vp_offset; @@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw) y_bias = fb_height; } - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { + for (unsigned i = 0; i < viewport_count; i++) { float scale[3], translate[3]; _mesa_get_viewport_xform(ctx, i, scale, translate); @@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = { .mesa = _NEW_BUFFERS | _NEW_VIEWPORT, .brw = BRW_NEW_BATCH | - BRW_NEW_BLORP, + BRW_NEW_BLORP | + BRW_NEW_VIEWPORT_COUNT, }, .emit = gen8_upload_sf_clip_viewport, }; -- 2.30.2