From 77449d7c41acab8b365023032512cf7eaed6b886 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 5 Apr 2019 11:54:10 -0700
Subject: [PATCH] iris: Track valid data range and infer unsynchronized
 mappings.

Applications frequently call glBufferSubData() to consecutive regions
of a VBO to append new vertex data.  If no data exists there yet, we
can promote these to unsynchronized writes, even if the buffer is busy,
since the GPU can't be doing anything useful with undefined content.
This can avoid a bunch of unnecessary blitting on the GPU.

u_threaded_context would do this for us, and in fact prohibits us from
doing so (see TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED).  But we haven't
hooked that up yet, and it may be useful to disable u_threaded_context
when debugging...at which point we'd still want this optimization.  At
the very least, it would let us measure the benefit of threading
independently from this optimization.  And it's not a lot of code.

Removes most stall avoidance blits in "Total War: WARHAMMER."

On my Skylake GT4e at 1920x1080, this appears to improve performance
in games by the following (but I did not do many runs for proper
statistics gathering):

   ----------------------------------------------
   | DiRT Rally        | +2% (avg) | + 2% (max) |
   | Bioshock Infinite | +3% (avg) | + 9% (max) |
   | Shadow of Mordor  | +7% (avg) | +20% (max) |
   ----------------------------------------------
---
 src/gallium/drivers/iris/iris_blit.c     |  6 +++
 src/gallium/drivers/iris/iris_clear.c    |  3 ++
 src/gallium/drivers/iris/iris_resource.c | 49 ++++++++++++++++++++++++
 src/gallium/drivers/iris/iris_resource.h | 11 ++++++
 src/gallium/drivers/iris/iris_state.c    |  9 +++++
 5 files changed, 78 insertions(+)

diff --git a/src/gallium/drivers/iris/iris_blit.c b/src/gallium/drivers/iris/iris_blit.c
index 8f5a9c984da..67283307ae6 100644
--- a/src/gallium/drivers/iris/iris_blit.c
+++ b/src/gallium/drivers/iris/iris_blit.c
@@ -429,6 +429,9 @@ iris_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
    if (flush_hack)
       tex_cache_flush_hack(batch);
 
+   if (dst_res->base.target == PIPE_BUFFER)
+      util_range_add(&dst_res->valid_buffer_range, dst_x0, dst_x1);
+
    struct blorp_batch blorp_batch;
    blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
 
@@ -550,6 +553,9 @@ iris_copy_region(struct blorp_context *blorp,
    if (flush_hack)
       tex_cache_flush_hack(batch);
 
+   if (dst->target == PIPE_BUFFER)
+      util_range_add(&dst_res->valid_buffer_range, dstx, dstx + src_box->width);
+
    if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
       struct blorp_address src_addr = {
          .buffer = iris_resource_bo(src), .offset = src_box->x,
diff --git a/src/gallium/drivers/iris/iris_clear.c b/src/gallium/drivers/iris/iris_clear.c
index 6e0a569e7b0..962b6fb414e 100644
--- a/src/gallium/drivers/iris/iris_clear.c
+++ b/src/gallium/drivers/iris/iris_clear.c
@@ -303,6 +303,9 @@ clear_color(struct iris_context *ice,
          blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
    }
 
+   if (p_res->target == PIPE_BUFFER)
+      util_range_add(&res->valid_buffer_range, box->x, box->x + box->width);
+
    iris_batch_maybe_flush(batch, 1500);
 
    bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box,
diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
index ddff9296c60..b1fdf9e0afc 100644
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -257,6 +257,9 @@ iris_resource_destroy(struct pipe_screen *screen,
 {
    struct iris_resource *res = (struct iris_resource *)resource;
 
+   if (resource->target == PIPE_BUFFER)
+      util_range_destroy(&res->valid_buffer_range);
+
    iris_resource_disable_aux(res);
 
    iris_bo_unreference(res->bo);
@@ -278,6 +281,9 @@ iris_alloc_resource(struct pipe_screen *pscreen,
    res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE;
    res->aux.sampler_usages = 1 << ISL_AUX_USAGE_NONE;
 
+   if (templ->target == PIPE_BUFFER)
+      util_range_init(&res->valid_buffer_range);
+
    return res;
 }
 
@@ -735,6 +741,8 @@ iris_resource_from_user_memory(struct pipe_screen *pscreen,
       return NULL;
    }
 
+   util_range_add(&res->valid_buffer_range, 0, templ->width0);
+
    return &res->base;
 }
 
@@ -901,6 +909,16 @@ iris_invalidate_resource(struct pipe_context *ctx,
    if (resource->target != PIPE_BUFFER)
       return;
 
+   if (!resource_is_busy(ice, res)) {
+      /* The resource is idle, so just mark that it contains no data and
+       * keep using the same underlying buffer object.
+       */
+      util_range_set_empty(&res->valid_buffer_range);
+      return;
+   }
+
+   /* Otherwise, try and replace the backing storage with a new BO. */
+
    /* We can't reallocate memory we didn't allocate in the first place. */
    if (res->bo->userptr)
       return;
@@ -916,8 +934,16 @@ iris_invalidate_resource(struct pipe_context *ctx,
    if (!new_bo)
       return;
 
+   /* Swap out the backing storage */
    res->bo = new_bo;
+
+   /* Rebind the buffer, replacing any state referring to the old BO's
+    * address, and marking state dirty so it's reemitted.
+    */
    ice->vtbl.rebind_buffer(ice, res, old_bo->gtt_offset);
+
+   util_range_set_empty(&res->valid_buffer_range);
+
    iris_bo_unreference(old_bo);
 }
 
@@ -1312,6 +1338,21 @@ iris_map_direct(struct iris_transfer *map)
    }
 }
 
+static bool
+can_promote_to_async(const struct iris_resource *res,
+                     const struct pipe_box *box,
+                     enum pipe_transfer_usage usage)
+{
+   /* If we're writing to a section of the buffer that hasn't even been
+    * initialized with useful data, then we can safely promote this write
+    * to be unsynchronized.  This helps the common pattern of appending data.
+    */
+   return res->base.target == PIPE_BUFFER && (usage & PIPE_TRANSFER_WRITE) &&
+          !(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) &&
+          !util_ranges_intersect(&res->valid_buffer_range, box->x,
+                                 box->x + box->width);
+}
+
 static void *
 iris_transfer_map(struct pipe_context *ctx,
                   struct pipe_resource *resource,
@@ -1342,6 +1383,11 @@ iris_transfer_map(struct pipe_context *ctx,
                                usage & PIPE_TRANSFER_WRITE);
    }
 
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
+       can_promote_to_async(res, box, usage)) {
+      usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+   }
+
    if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
       map_would_stall = resource_is_busy(ice, res);
 
@@ -1369,6 +1415,9 @@ iris_transfer_map(struct pipe_context *ctx,
    xfer->box = *box;
    *ptransfer = xfer;
 
+   if (usage & PIPE_TRANSFER_WRITE)
+      util_range_add(&res->valid_buffer_range, box->x, box->x + box->width);
+
    /* Avoid using GPU copies for persistent/coherent buffers, as the idea
     * there is to access them simultaneously on the CPU & GPU.  This also
     * avoids trying to use GPU copies for our u_upload_mgr buffers which
diff --git a/src/gallium/drivers/iris/iris_resource.h b/src/gallium/drivers/iris/iris_resource.h
index 43d7e3f15f8..2a21f4edb90 100644
--- a/src/gallium/drivers/iris/iris_resource.h
+++ b/src/gallium/drivers/iris/iris_resource.h
@@ -25,6 +25,7 @@
 
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
+#include "util/u_range.h"
 #include "intel/isl/isl.h"
 
 struct iris_batch;
@@ -73,6 +74,16 @@ struct iris_resource {
     */
    unsigned bind_history;
 
+   /**
+    * For PIPE_BUFFER resources, a range which may contain valid data.
+    *
+    * This is a conservative estimate of what part of the buffer contains
+    * valid data that we have to preserve.  The rest of the buffer is
+    * considered invalid, and we can promote writes to that region to
+    * be unsynchronized writes, avoiding blit copies.
+    */
+   struct util_range valid_buffer_range;
+
    /**
     * Auxiliary buffer information (CCS, MCS, or HiZ).
     */
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index d6a8ba4fb47..55ed7be647a 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -2009,6 +2009,9 @@ iris_set_shader_images(struct pipe_context *ctx,
                                       &image_params[start_slot + i],
                                       &res->surf, &view);
          } else {
+            util_range_add(&res->valid_buffer_range, img->u.buf.offset,
+                           img->u.buf.offset + img->u.buf.size);
+
             fill_buffer_surface_state(&screen->isl_dev, res->bo, map,
                                       isl_fmt, ISL_SWIZZLE_IDENTITY,
                                       img->u.buf.offset, img->u.buf.size);
@@ -2626,6 +2629,9 @@ iris_set_shader_buffers(struct pipe_context *ctx,
          upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, true);
 
          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
+
+         util_range_add(&res->valid_buffer_range, ssbo->buffer_offset,
+                        ssbo->buffer_offset + ssbo->buffer_size);
       } else {
          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
          pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
@@ -2866,6 +2872,9 @@ iris_create_stream_output_target(struct pipe_context *ctx,
    cso->base.buffer_size = buffer_size;
    cso->base.context = ctx;
 
+   util_range_add(&res->valid_buffer_range, buffer_offset,
+                  buffer_offset + buffer_size);
+
    upload_state(ctx->stream_uploader, &cso->offset, sizeof(uint32_t), 4);
 
    return &cso->base;
-- 
2.30.2