From 7c5988e615e580f771f6f478631d63aaada872a6 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 5 Sep 2017 15:14:18 -0700
Subject: [PATCH] i965: Disentangle batch and state buffer flushing.

We now flush the batch when either the batchbuffer or statebuffer
reaches the original intended batch size, instead of when the sum of
the two reaches a certain size (which makes no sense now that they're
separate buffers).

With this change, we also need to update our "are we near the end?"
estimate to require separate batch and state buffer space.  I obtained
these estimates by looking at the size of draw calls in the Unreal 4
Elemental Demo (using INTEL_DEBUG=flush and always_flush_batch=true).

This will significantly impact the size of our batches.  I've adjusted
both down to try and be roughly similar to what we had been doing.  On
various benchmarks, a 20kB batch and 16kB statebuffer seemed to be about
right, but we may need to adjust this further.  I tried a 16kB batch,
but that regressed Synmark OglMultithread performance by a fair bit.
32kB for both would have significantly increased our batch sizes.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_compute.c       | 18 +++---------
 src/mesa/drivers/dri/i965/brw_draw.c          | 18 +++---------
 src/mesa/drivers/dri/i965/brw_state.h         |  1 +
 src/mesa/drivers/dri/i965/genX_blorp_exec.c   |  4 +--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 29 ++++++++++++-------
 5 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index 1bad7ac7a0c..7f0278ac92b 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -167,7 +167,6 @@ static void
 brw_dispatch_compute_common(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
-   int estimated_buffer_space_needed;
    bool fail_next = false;
 
    if (!_mesa_check_conditional_render(ctx))
@@ -180,20 +179,11 @@ brw_dispatch_compute_common(struct gl_context *ctx)
 
    brw_predraw_resolve_inputs(brw);
 
-   const int sampler_state_size = 16; /* 16 bytes */
-   estimated_buffer_space_needed = 512; /* batchbuffer commands */
-   estimated_buffer_space_needed += (BRW_MAX_TEX_UNIT *
-                                     (sampler_state_size +
-                                      sizeof(struct gen5_sampler_default_color)));
-   estimated_buffer_space_needed += 1024; /* push constants */
-   estimated_buffer_space_needed += 512; /* misc. pad */
-
-   /* Flush the batch if it's approaching full, so that we don't wrap while
-    * we've got validated state that needs to be in the same batch as the
-    * primitives.
+   /* Flush the batch if the batch/state buffers are nearly full.  We can
+    * grow them if needed, but this is not free, so we'd like to avoid it.
     */
-   intel_batchbuffer_require_space(brw, estimated_buffer_space_needed,
-                                   RENDER_RING);
+   intel_batchbuffer_require_space(brw, 600, RENDER_RING);
+   brw_require_statebuffer_space(brw, 2500);
    intel_batchbuffer_save_state(brw);
 
  retry:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index d1ec2e3f09d..06c6ed72c98 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -669,26 +669,16 @@ brw_try_draw_prims(struct gl_context *ctx,
    brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
 
    for (i = 0; i < nr_prims; i++) {
-      int estimated_max_prim_size;
-      const int sampler_state_size = 16;
-
-      estimated_max_prim_size = 512; /* batchbuffer commands */
-      estimated_max_prim_size += BRW_MAX_TEX_UNIT *
-         (sampler_state_size + sizeof(struct gen5_sampler_default_color));
-      estimated_max_prim_size += 1024; /* gen6 VS push constants */
-      estimated_max_prim_size += 1024; /* gen6 WM push constants */
-      estimated_max_prim_size += 512; /* misc. pad */
-
       /* Flag BRW_NEW_DRAW_CALL on every draw.  This allows us to have
        * atoms that happen on every draw call.
        */
       brw->ctx.NewDriverState |= BRW_NEW_DRAW_CALL;
 
-      /* Flush the batch if it's approaching full, so that we don't wrap while
-       * we've got validated state that needs to be in the same batch as the
-       * primitives.
+      /* Flush the batch if the batch/state buffers are nearly full.  We can
+       * grow them if needed, but this is not free, so we'd like to avoid it.
        */
-      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
+      intel_batchbuffer_require_space(brw, 1500, RENDER_RING);
+      brw_require_statebuffer_space(brw, 2400);
       intel_batchbuffer_save_state(brw);
 
       if (brw->num_instances != prims[i].num_instances ||
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index c8b71e72de5..9718739dea9 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -185,6 +185,7 @@ void brw_destroy_caches( struct brw_context *brw );
 void brw_print_program_cache(struct brw_context *brw);
 
 /* intel_batchbuffer.c */
+void brw_require_statebuffer_space(struct brw_context *brw, int size);
 void *brw_state_batch(struct brw_context *brw,
                       int size, int alignment, uint32_t *out_offset);
 uint32_t brw_state_batch_size(struct brw_context *brw, uint32_t offset);
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 5bff7eaff59..3fe81c7c6a1 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -205,7 +205,6 @@ genX(blorp_exec)(struct blorp_batch *batch,
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
    struct gl_context *ctx = &brw->ctx;
-   const uint32_t estimated_max_batch_usage = GEN_GEN >= 8 ? 1920 : 1700;
    bool check_aperture_failed_once = false;
 
    /* Flush the sampler and render caches.  We definitely need to flush the
@@ -222,7 +221,8 @@ genX(blorp_exec)(struct blorp_batch *batch,
    brw_select_pipeline(brw, BRW_RENDER_PIPELINE);
 
 retry:
-   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
+   intel_batchbuffer_require_space(brw, 1400, RENDER_RING);
+   brw_require_statebuffer_space(brw, 600);
    intel_batchbuffer_save_state(brw);
    brw->no_batch_wrap = true;
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 5aa34e74293..fddc84fcf9b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -49,8 +49,8 @@
  * should flush.  Each time we flush the batch, we recreate both buffers
  * at the original target size, so it doesn't grow without bound.
  */
-#define BATCH_SZ (8192*sizeof(uint32_t))
-#define STATE_SZ (8192*sizeof(uint32_t))
+#define BATCH_SZ (20 * 1024)
+#define STATE_SZ (16 * 1024)
 
 /* The kernel assumes batchbuffers are smaller than 256kB. */
 #define MAX_BATCH_SIZE (256 * 1024)
@@ -369,9 +369,8 @@ intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
       intel_batchbuffer_flush(brw);
    }
 
-   /* For now, flush as if the batch and state buffers still shared a BO */
    const unsigned batch_used = USED_BATCH(*batch) * 4;
-   if (batch_used + sz >= BATCH_SZ - batch->state_used) {
+   if (batch_used + sz >= BATCH_SZ) {
       if (!brw->no_batch_wrap) {
          intel_batchbuffer_flush(brw);
       } else {
@@ -380,7 +379,7 @@ intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
          grow_buffer(brw, &batch->bo, &batch->map, &batch->batch_cpu_map,
                      batch_used, new_size);
          batch->map_next = (void *) batch->map + batch_used;
-         assert(batch_used + sz < batch->bo->size - batch->state_used);
+         assert(batch_used + sz < batch->bo->size);
       }
    }
 
@@ -1011,6 +1010,19 @@ brw_state_batch_size(struct brw_context *brw, uint32_t offset)
    return entry ? (uintptr_t) entry->data : 0;
 }
 
+/**
+ * Ensure the statebuffer has room for size more bytes, or flush.
+ *
+ * Nothing is actually reserved: this only estimates whether we're near
+ * the end of the statebuffer (STATE_SZ), so we can flush early.
+ */
+void
+brw_require_statebuffer_space(struct brw_context *brw, int size)
+{
+   if (brw->batch.state_used + size >= STATE_SZ)
+      intel_batchbuffer_flush(brw);
+}
+
 /**
  * Allocates a block of space in the batchbuffer for indirect state.
  */
@@ -1026,10 +1038,7 @@ brw_state_batch(struct brw_context *brw,
 
    uint32_t offset = ALIGN(batch->state_used, alignment);
 
-   /* For now, follow the old flushing behavior. */
-   int batch_space = USED_BATCH(*batch) * 4;
-
-   if (offset + size >= STATE_SZ - batch_space) {
+   if (offset + size >= STATE_SZ) {
       if (!brw->no_batch_wrap) {
          intel_batchbuffer_flush(brw);
          offset = ALIGN(batch->state_used, alignment);
@@ -1039,7 +1048,7 @@ brw_state_batch(struct brw_context *brw,
                  MAX_STATE_SIZE);
          grow_buffer(brw, &batch->state_bo, &batch->state_map,
                      &batch->state_cpu_map, batch->state_used, new_size);
-         assert(offset + size < batch->state_bo->size - batch_space);
+         assert(offset + size < batch->state_bo->size);
       }
    }
 
-- 
2.30.2