i965: Only emit 1 viewport when possible.
author Kenneth Graunke <kenneth@whitecape.org>
Mon, 26 Sep 2016 17:30:30 +0000 (10:30 -0700)
committer Kenneth Graunke <kenneth@whitecape.org>
Tue, 4 Oct 2016 01:41:10 +0000 (18:41 -0700)
In core profile, we support up to 16 viewports.  However, in the
majority of cases, only 1 of them is actually used - we only need
the others if the last shader stage prior to the rasterizer writes
gl_ViewportIndex.

Processing all 16 viewports adds additional CPU overhead, which hurts
CPU-intensive workloads such as Glamor.  This meant that switching to
core profile actually penalized Glamor to an extent, which is
unfortunate.

This patch tracks the number of relevant viewports: it is
ctx->Const.MaxViewports if gl_ViewportIndex is written, and 1 otherwise.  A new
BRW_NEW_VIEWPORT_COUNT flag tracks this.  This could mean re-emitting
viewport state when switching, but hopefully this is offset by doing
1/16th of the work in the common case.  The new flag is also lighter
weight than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case.

According to Eric Anholt, x11perf -copypixwin10 performance improves by
11.5094% +/- 3.10841% (n=10) on his Skylake.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Acked-by: Anuj Phogat <anuj.phogat@gmail.com>
src/mesa/drivers/dri/i965/brw_cc.c
src/mesa/drivers/dri/i965/brw_context.c
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_gs_state.c
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/gen6_clip_state.c
src/mesa/drivers/dri/i965/gen6_scissor_state.c
src/mesa/drivers/dri/i965/gen6_viewport_state.c
src/mesa/drivers/dri/i965/gen7_viewport_state.c
src/mesa/drivers/dri/i965/gen8_viewport_state.c

index 5c58b448c1fb32f049d990b3d2fe947f50b3a460..b11d7c85ca9e51625fec617e331b41fe05d41298 100644 (file)
@@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    struct brw_cc_viewport *ccv;
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE,
-                        sizeof(*ccv) * ctx->Const.MaxViewports, 32,
+                        sizeof(*ccv) * viewport_count, 32,
                          &brw->cc.vp_offset);
 
    /* _NEW_TRANSFORM */
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       if (ctx->Transform.DepthClamp) {
          /* _NEW_VIEWPORT */
          ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near,
@@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = {
       .mesa = _NEW_TRANSFORM |
               _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = brw_upload_cc_vp
 };
index 6efad7849d6e18c773ccf461a2413ddb8f93a206..b0eec16d826642959a0eda7588ed86a61d939bc1 100644 (file)
@@ -1085,6 +1085,7 @@ brwCreateContext(gl_api api,
    brw->prim_restart.enable_cut_index = false;
    brw->gs.enabled = false;
    brw->sf.viewport_transform_enable = true;
+   brw->clip.viewport_count = 1;
 
    brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
 
index 00f0adca4d5c04b5e5e9eef6f3b96de15e179326..b27fe51e7062a64ce3ad9caff5189c57ad506aec 100644 (file)
@@ -226,6 +226,7 @@ enum brw_state_id {
    BRW_STATE_URB_SIZE,
    BRW_STATE_CC_STATE,
    BRW_STATE_BLORP,
+   BRW_STATE_VIEWPORT_COUNT,
    BRW_NUM_STATE_BITS
 };
 
@@ -294,6 +295,7 @@ enum brw_state_id {
 #define BRW_NEW_PROGRAM_CACHE           (1ull << BRW_STATE_PROGRAM_CACHE)
 #define BRW_NEW_STATE_BASE_ADDRESS      (1ull << BRW_STATE_STATE_BASE_ADDRESS)
 #define BRW_NEW_VUE_MAP_GEOM_OUT        (1ull << BRW_STATE_VUE_MAP_GEOM_OUT)
+#define BRW_NEW_VIEWPORT_COUNT          (1ull << BRW_STATE_VIEWPORT_COUNT)
 #define BRW_NEW_TRANSFORM_FEEDBACK      (1ull << BRW_STATE_TRANSFORM_FEEDBACK)
 #define BRW_NEW_RASTERIZER_DISCARD      (1ull << BRW_STATE_RASTERIZER_DISCARD)
 #define BRW_NEW_STATS_WM                (1ull << BRW_STATE_STATS_WM)
@@ -1160,6 +1162,13 @@ struct brw_context
        * instead of vp_bo.
        */
       uint32_t vp_offset;
+
+      /**
+       * The number of viewports to use.  If gl_ViewportIndex is written,
+       * we can have up to ctx->Const.MaxViewports viewports.  If not,
+       * the viewport index is always 0, so we only need to emit one.
+       */
+      uint8_t viewport_count;
    } clip;
 
 
index 1757201675d324541f8533c70833bef4d9ddb147..8e3bf1ef651ab5bce4bd5d8577a62305c2509e37 100644 (file)
@@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw)
    if (unlikely(INTEL_DEBUG & DEBUG_STATS))
       gs->thread4.stats_enable = 1;
 
-   gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1;
+   /* BRW_NEW_VIEWPORT_COUNT */
+   gs->gs6.max_vp_index = brw->clip.viewport_count - 1;
 
    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
 }
@@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = {
                BRW_NEW_CURBE_OFFSETS |
                BRW_NEW_FF_GS_PROG_DATA |
                BRW_NEW_PROGRAM_CACHE |
-               BRW_NEW_URB_FENCE,
+               BRW_NEW_URB_FENCE |
+               BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = brw_upload_gs_unit,
 };
index 8ce6851814b5d67a5e8fa4f69f8a65491372c1d5..055fed128a08dfad1a086759f1face71bc683a82 100644 (file)
@@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_URB_SIZE),
    DEFINE_BIT(BRW_NEW_CC_STATE),
    DEFINE_BIT(BRW_NEW_BLORP),
+   DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT),
    {0, 0, 0}
 };
 
@@ -696,6 +697,8 @@ static inline void
 brw_upload_programs(struct brw_context *brw,
                     enum brw_pipeline pipeline)
 {
+   struct gl_context *ctx = &brw->ctx;
+
    if (pipeline == BRW_RENDER_PIPELINE) {
       brw_upload_vs_prog(brw);
       brw_upload_tess_programs(brw);
@@ -722,6 +725,14 @@ brw_upload_programs(struct brw_context *brw,
           old_separate != brw->vue_map_geom_out.separate)
          brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
 
+      if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
+          VARYING_BIT_VIEWPORT) {
+         ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
+         brw->clip.viewport_count =
+            (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
+            ctx->Const.MaxViewports : 1;
+      }
+
       if (brw->gen < 6) {
          brw_setup_vue_interpolation(brw);
          brw_upload_clip_prog(brw);
index 7dc974043696285989563caeaebbaab29b5161aa..9c33e67c9aad8abf26b93a87841670643b892500 100644 (file)
@@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw)
 
    dw2 |= GEN6_CLIP_GB_TEST;
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    /* We need to disable guardband clipping if the guardband (which we always
     * program to the maximum screen-space bounding box of 8K x 8K) will be
     * smaller than the viewport.
@@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
     * "objects must have a screenspace bounding box not exceeding 8K in the X
     * or Y direction" restriction.  Instead, they're clipped.
     */
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       if (ctx->ViewportArray[i].Width > 8192 ||
           ctx->ViewportArray[i].Height > 8192) {
          dw2 &= ~GEN6_CLIP_GB_TEST;
@@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw)
       const float fb_width = (float)_mesa_geometric_width(fb);
       const float fb_height = (float)_mesa_geometric_height(fb);
 
-      for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+      for (unsigned i = 0; i < viewport_count; i++) {
          if (ctx->ViewportArray[i].X != 0 ||
              ctx->ViewportArray[i].Y != 0 ||
              ctx->ViewportArray[i].Width != fb_width ||
@@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw)
    if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
       dw2 |= GEN6_CLIP_XY_TEST;
 
-   /* BRW_NEW_VUE_MAP_GEOM_OUT */
-   const int max_vp_index =
-      (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ?
-      ctx->Const.MaxViewports : 1;
-
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
    OUT_BATCH(dw1);
@@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw)
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
              U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
              (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
-             ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
+             ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
    ADVANCE_BATCH();
 }
 
@@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = {
                BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD |
                BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_VUE_MAP_GEOM_OUT,
+               BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = upload_clip_state,
 };
index b03ac730a3ea52065745cc5f39f1acb1c653f717..860445a2b43000c5b0ae82ff123eb5fbbc7d180c 100644 (file)
@@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
    const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
-                            sizeof(*scissor) * ctx->Const.MaxViewports, 32,
+                            sizeof(*scissor) * viewport_count, 32,
                              &scissor_state_offset);
 
    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
@@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw)
     * Note that the hardware's coordinates are inclusive, while Mesa's min is
     * inclusive but max is exclusive.
     */
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       int bbox[4];
 
       bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
@@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = {
               _NEW_SCISSOR |
               _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = gen6_upload_scissor_state,
 };
index eacffb9ba0f459e20712355372210b9e948d291b..ad1e72d0a501132d600c41c75ff908d53d64f364 100644 (file)
@@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    struct brw_clipper_viewport *vp;
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE,
-                        sizeof(*vp) * ctx->Const.MaxViewports, 32, &brw->clip.vp_offset);
+                        sizeof(*vp) * viewport_count, 32, &brw->clip.vp_offset);
 
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       /* According to the "Vertex X,Y Clamping and Quantization" section of the
        * Strips and Fans documentation, objects must not have a screen-space
        * extents of over 8192 pixels, or they may be mis-rasterized.  The maximum
@@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = {
    .dirty = {
       .mesa = _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = gen6_upload_clip_vp,
 };
@@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw)
    GLfloat y_scale, y_bias;
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
-                         sizeof(*sfv) * ctx->Const.MaxViewports,
+                         sizeof(*sfv) * viewport_count,
                          32, &brw->sf.vp_offset);
-   memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports);
+   memset(sfv, 0, sizeof(*sfv) * viewport_count);
 
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
@@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
       y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       float scale[3], translate[3];
 
       /* _NEW_VIEWPORT */
@@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = {
       .mesa = _NEW_BUFFERS |
               _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = gen6_upload_sf_vp,
 };
index 34f93afdef647ec132556dbdbc5dd67851d6e01c..c447331a2e512c8fcc700b875d3ec473046ad18b 100644 (file)
@@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    struct gen7_sf_clip_viewport *vp;
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
-                        sizeof(*vp) * ctx->Const.MaxViewports, 64,
+                        sizeof(*vp) * viewport_count, 64,
                         &brw->sf.vp_offset);
    /* Also assign to clip.vp_offset in case something uses it. */
    brw->clip.vp_offset = brw->sf.vp_offset;
@@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
       y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
@@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = {
       .mesa = _NEW_BUFFERS |
               _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = gen7_upload_sf_clip_viewport,
 };
index acaee1a94e8b147b62e3582d2dfbcff76bc9871c..84000e3a7e23f12de3d5b573de035edae1cb6ebf 100644 (file)
@@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
    const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
    float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
-                               16 * 4 * ctx->Const.MaxViewports,
+                               16 * 4 * viewport_count,
                                64, &brw->sf.vp_offset);
    /* Also assign to clip.vp_offset in case something uses it. */
    brw->clip.vp_offset = brw->sf.vp_offset;
@@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
       y_bias = fb_height;
    }
 
-   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+   for (unsigned i = 0; i < viewport_count; i++) {
       float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
@@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = {
       .mesa = _NEW_BUFFERS |
               _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = gen8_upload_sf_clip_viewport,
 };