i965: Send the minimal number of STATE_BASE_ADDRESS packets.
author Kenneth Graunke <kenneth@whitecape.org>
Wed, 27 Apr 2016 16:35:03 +0000 (09:35 -0700)
committer Kenneth Graunke <kenneth@whitecape.org>
Mon, 16 May 2016 07:11:51 +0000 (00:11 -0700)
STATE_BASE_ADDRESS stalls the whole pipeline, and the documentation
cautions us to emit it as little as possible for better performance.

We recently put some hacks in BLORP to try to avoid emitting it
if it was already set correctly.  However, this wasn't quite minimal:
if BLORP was the first operation (e.g. glClear()), then BLORP would emit
the packet, and subsequent draw calls would emit it again.

This caused a small drop in performance in GPUTest Triangle when
switching from Meta to BLORP.

Unlike most packets, STATE_BASE_ADDRESS isn't influenced by GL state:
it needs to be emitted once per batch, before most other commands, or
whenever we change the program cache BO.  It's also valid in both the
3D and compute pipelines, which makes it even more unusual.

This patch removes it from the atom mechanism and instead calls
brw_upload_state_base_address() directly as part of every draw, compute
dispatch, or BLORP operation.  We introduce a new flag indicating that
STATE_BASE_ADDRESS has already been emitted this batch, and if so, skip
doing it again.  When we make a new program cache BO, we simply reset
the flag, so the next operation will emit it again.  We also reset the
flag whenever we flush/reset the batch.

This guarantees that we'll emit STATE_BASE_ADDRESS only when we have to.
It's also less code than the old atom mechanism.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_misc_state.c
src/mesa/drivers/dri/i965/brw_state.h
src/mesa/drivers/dri/i965/brw_state_cache.c
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/gen6_blorp.c
src/mesa/drivers/dri/i965/gen7_blorp.c
src/mesa/drivers/dri/i965/gen8_blorp.c
src/mesa/drivers/dri/i965/intel_batchbuffer.c

index e8c7f0245be6f6fbdfcbef84d56319d6e359f3bc..9a4dd31c4bbccfea8c49cb2fc152595f80014a93 100644 (file)
@@ -619,6 +619,7 @@ struct intel_batchbuffer {
    uint32_t state_batch_offset;
    enum brw_gpu_ring ring;
    bool needs_sol_reset;
+   bool state_base_address_emitted;
 
    struct {
       uint32_t *map_next;
index c72f607785acfdaab7049218c4a8c37c4bb53204..5510d2c36f40e1c1cc92082127cc71def50edc4b 100644 (file)
@@ -1066,6 +1066,9 @@ const struct brw_tracked_state brw_invariant_state = {
 void
 brw_upload_state_base_address(struct brw_context *brw)
 {
+   if (brw->batch.state_base_address_emitted)
+      return;
+
    /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
     * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
     * programmed prior to STATE_BASE_ADDRESS.
@@ -1201,13 +1204,5 @@ brw_upload_state_base_address(struct brw_context *brw)
     */
 
    brw->ctx.NewDriverState |= BRW_NEW_STATE_BASE_ADDRESS;
+   brw->batch.state_base_address_emitted = true;
 }
-
-const struct brw_tracked_state brw_state_base_address = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_PROGRAM_CACHE,
-   },
-   .emit = brw_upload_state_base_address
-};
index 880f44e553b1fe548098940e679e650aa5ff37ed..70b17fd3f619fc2dfc9b2a653f2aec40c871518b 100644 (file)
@@ -66,7 +66,6 @@ extern const struct brw_tracked_state brw_polygon_stipple;
 extern const struct brw_tracked_state brw_recalculate_urb_fence;
 extern const struct brw_tracked_state brw_sf_unit;
 extern const struct brw_tracked_state brw_sf_vp;
-extern const struct brw_tracked_state brw_state_base_address;
 extern const struct brw_tracked_state brw_vs_samplers;
 extern const struct brw_tracked_state brw_tcs_samplers;
 extern const struct brw_tracked_state brw_tes_samplers;
@@ -195,7 +194,6 @@ brw_depthbuffer_format(struct brw_context *brw);
 
 void brw_upload_state_base_address(struct brw_context *brw);
 
-
 /* gen8_depth_state.c */
 void gen8_write_pma_stall_bits(struct brw_context *brw,
                                uint32_t pma_stall_bits);
index c6aa1344270032a1f45bdc475aa1fb7a5a2f2e14..0e98e654c1c7c67e41afc2de4a59999c5f701d93 100644 (file)
@@ -199,6 +199,7 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
     * that depend on it (state base address on gen5+, or unit state before).
     */
    brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
+   brw->batch.state_base_address_emitted = false;
 }
 
 /**
index ed92a85f92341fea816fb0604b2e7af02c0075ec..0b47ebe3a145b272709bb01d881a0eac15429e6c 100644 (file)
@@ -79,7 +79,6 @@ static const struct brw_tracked_state *gen4_atoms[] =
    /* Command packets:
     */
    &brw_invariant_state,
-   &brw_state_base_address,
 
    &brw_binding_table_pointers,
    &brw_blend_constant_color,
@@ -109,9 +108,6 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
    /* Command packets: */
 
-   /* must do before binding table pointers, cc state ptrs */
-   &brw_state_base_address,
-
    &brw_cc_vp,
    &gen6_viewport_state,       /* must do after *_vp stages */
 
@@ -175,9 +171,6 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
 {
    /* Command packets: */
 
-   /* must do before binding table pointers, cc state ptrs */
-   &brw_state_base_address,
-
    &brw_cc_vp,
    &gen7_sf_clip_viewport,
 
@@ -268,7 +261,6 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
 
 static const struct brw_tracked_state *gen7_compute_atoms[] =
 {
-   &brw_state_base_address,
    &gen7_l3_state,
    &brw_cs_image_surfaces,
    &gen7_cs_push_constants,
@@ -283,9 +275,6 @@ static const struct brw_tracked_state *gen7_compute_atoms[] =
 
 static const struct brw_tracked_state *gen8_render_atoms[] =
 {
-   /* Command packets: */
-   &brw_state_base_address,
-
    &brw_cc_vp,
    &gen8_sf_clip_viewport,
 
@@ -383,7 +372,6 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
 
 static const struct brw_tracked_state *gen8_compute_atoms[] =
 {
-   &brw_state_base_address,
    &gen7_l3_state,
    &brw_cs_image_surfaces,
    &gen7_cs_push_constants,
@@ -847,6 +835,8 @@ brw_upload_pipeline_state(struct brw_context *brw,
    brw_upload_programs(brw, pipeline);
    merge_ctx_state(brw, &state);
 
+   brw_upload_state_base_address(brw);
+
    const struct brw_tracked_state *atoms =
       brw_get_pipeline_atoms(brw, pipeline);
    const int num_atoms = brw->num_atoms[pipeline];
index 9d72745d31ef315ff40993c2bc79c139b2b524b3..5f84ab09e10d9ddbbab3d09c2b67bde0ff093bf6 100644 (file)
@@ -996,8 +996,7 @@ gen6_blorp_exec(struct brw_context *brw,
    /* Emit workaround flushes when we switch from drawing to blorping. */
    brw_emit_post_sync_nonzero_flush(brw);
 
-   if (brw_state_base_address.dirty.brw & brw->ctx.NewDriverState)
-      brw_upload_state_base_address(brw);
+   brw_upload_state_base_address(brw);
 
    gen6_emit_3dstate_multisample(brw, params->dst.num_samples);
    gen6_emit_3dstate_sample_mask(brw,
index 494ed46dc70668d6dba3aa2d988988b889f0e1a1..3584864417af4b3468b3a577e1df8a1fff57271c 100644 (file)
@@ -817,8 +817,7 @@ gen7_blorp_exec(struct brw_context *brw,
    uint32_t wm_push_const_offset = 0;
    uint32_t wm_bind_bo_offset = 0;
 
-   if (brw_state_base_address.dirty.brw & brw->ctx.NewDriverState)
-      brw_upload_state_base_address(brw);
+   brw_upload_state_base_address(brw);
 
    gen6_emit_3dstate_multisample(brw, params->dst.num_samples);
    gen6_emit_3dstate_sample_mask(brw,
index 6a783a75df9fea04371693271e32502640e3d23a..a9a400d906631fea492e25a9042870c316b1a38d 100644 (file)
@@ -653,8 +653,7 @@ gen8_blorp_exec(struct brw_context *brw, const struct brw_blorp_params *params)
 {
    uint32_t wm_bind_bo_offset = 0;
 
-   if (brw_state_base_address.dirty.brw & brw->ctx.NewDriverState)
-      brw_upload_state_base_address(brw);
+   brw_upload_state_base_address(brw);
 
    gen7_blorp_emit_cc_viewport(brw);
    gen7_l3_state.emit(brw);
index f50b2b473c903f658ad8a5cbda72d03f8b7930a6..f220311842ac0af9bb9d7ff251863c9a6ec5df1e 100644 (file)
@@ -73,6 +73,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
    brw->batch.needs_sol_reset = false;
+   brw->batch.state_base_address_emitted = false;
 
    /* We don't know what ring the new batch will be sent to until we see the
     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.