From: Kenneth Graunke Date: Sun, 9 Sep 2018 02:43:34 +0000 (-0700) Subject: iris: Support multiple binder BOs, update Surface State Base Address X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=eff081cdd9524f787339c13433b6b7758be474df;p=mesa.git iris: Support multiple binder BOs, update Surface State Base Address --- diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c index 2271513f6c9..b35466d69f1 100644 --- a/src/gallium/drivers/iris/iris_batch.c +++ b/src/gallium/drivers/iris/iris_batch.c @@ -38,7 +38,6 @@ */ #include "iris_batch.h" -#include "iris_binder.h" #include "iris_bufmgr.h" #include "iris_context.h" @@ -158,8 +157,6 @@ iris_init_batch(struct iris_batch *batch, batch->validation_list = malloc(batch->exec_array_size * sizeof(batch->validation_list[0])); - batch->binder.bo = NULL; - batch->cache.render = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); batch->cache.depth = _mesa_set_create(NULL, _mesa_hash_pointer, @@ -254,9 +251,6 @@ iris_batch_reset(struct iris_batch *batch) create_batch(batch); assert(batch->bo->index == 0); - iris_destroy_binder(&batch->binder); - iris_init_binder(&batch->binder, batch->bo->bufmgr); - if (batch->state_sizes) _mesa_hash_table_clear(batch->state_sizes, NULL); @@ -281,8 +275,6 @@ iris_batch_free(struct iris_batch *batch) _mesa_hash_table_destroy(batch->cache.render, NULL); _mesa_set_destroy(batch->cache.depth, NULL); - iris_destroy_binder(&batch->binder); - if (batch->state_sizes) { _mesa_hash_table_destroy(batch->state_sizes, NULL); gen_batch_decode_ctx_finish(&batch->decoder); @@ -432,18 +424,16 @@ _iris_batch_flush_fence(struct iris_batch *batch, if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) { int bytes_for_commands = iris_batch_bytes_used(batch); - int bytes_for_binder = batch->binder.insert_point; int second_bytes = 0; if (batch->bo != batch->exec_bos[0]) { second_bytes = bytes_for_commands; bytes_for_commands += 
batch->primary_batch_size; } fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5d+%5db (%0.1f%%) " - "(cmds), %5db (%0.1f%%) (binder), %4d BOs (%0.1fMb aperture)\n", + "(cmds), %4d BOs (%0.1fMb aperture)\n", file, line, batch->primary_batch_size, second_bytes, 100.0f * bytes_for_commands / BATCH_SZ, - bytes_for_binder, 100.0f * bytes_for_binder / IRIS_BINDER_SIZE, batch->exec_count, (float) batch->aperture_space / (1024 * 1024)); dump_validation_list(batch); diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h index 7d446817d3d..8ff3f60fa9d 100644 --- a/src/gallium/drivers/iris/iris_batch.h +++ b/src/gallium/drivers/iris/iris_batch.h @@ -29,7 +29,6 @@ #include #include "i915_drm.h" #include "common/gen_decoder.h" -#include "iris_binder.h" /* The kernel assumes batchbuffers are smaller than 256kB. */ #define MAX_BATCH_SIZE (256 * 1024) @@ -58,6 +57,9 @@ struct iris_batch { /** Last BO submitted to the hardware. Used for glFinish(). */ struct iris_bo *last_bo; + /** Last Surface State Base Address set in this hardware context. */ + uint64_t last_surface_base_address; + uint32_t hw_ctx_id; /** Which engine this batch targets - a I915_EXEC_RING_MASK value */ @@ -72,9 +74,6 @@ struct iris_batch { /** The amount of aperture space (in bytes) used by all exec_bos */ int aperture_space; - /** Binder (containing binding tables) */ - struct iris_binder binder; - struct { /** * Set of struct brw_bo * that have been rendered to within this diff --git a/src/gallium/drivers/iris/iris_binder.c b/src/gallium/drivers/iris/iris_binder.c index cba84f5fa53..2cac1b71256 100644 --- a/src/gallium/drivers/iris/iris_binder.c +++ b/src/gallium/drivers/iris/iris_binder.c @@ -49,6 +49,8 @@ * and cycling back around where possible to avoid replacing it at all costs. * * XXX: if we do have to flush, we should emit a performance warning. 
+ * + * XXX: these comments are out of date */ #include @@ -62,98 +64,131 @@ /* Avoid using offset 0, tools consider it NULL */ #define INIT_INSERT_POINT BTP_ALIGNMENT -/** - * Reserve a block of space in the binder, given the raw size in bytes. - */ -uint32_t -iris_binder_reserve(struct iris_batch *batch, unsigned size) +static bool +binder_has_space(struct iris_binder *binder, unsigned size) +{ + return binder->insert_point + size <= IRIS_BINDER_SIZE; +} + +static void +binder_realloc(struct iris_context *ice) { - struct iris_binder *binder = &batch->binder; + struct iris_screen *screen = (void *) ice->ctx.screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; + struct iris_binder *binder = &ice->state.binder; - assert(size > 0); - assert((binder->insert_point % BTP_ALIGNMENT) == 0); + iris_bo_unreference(binder->bo); + + binder->bo = + iris_bo_alloc(bufmgr, "binder", IRIS_BINDER_SIZE, IRIS_MEMZONE_BINDER); + binder->map = iris_bo_map(NULL, binder->bo, MAP_WRITE); + binder->insert_point = INIT_INSERT_POINT; - /* If we can't fit all stages in the binder, flush the batch which - * will cause us to gain a new empty binder. + /* Allocating a new binder requires changing Surface State Base Address, + * which also invalidates all our previous binding tables - each entry + * in those tables is an offset from the old base. + * + * We do this here so that iris_binder_reserve_3d correctly gets a new + * larger total_size when making the updated reservation. */ - if (binder->insert_point + size > IRIS_BINDER_SIZE) - iris_batch_flush(batch); + ice->state.dirty |= IRIS_ALL_DIRTY_BINDINGS; +} +static uint32_t +binder_insert(struct iris_binder *binder, unsigned size) +{ uint32_t offset = binder->insert_point; - /* It had better fit now. 
*/ - assert(offset + size <= IRIS_BINDER_SIZE); - binder->insert_point = align(binder->insert_point + size, BTP_ALIGNMENT); - iris_use_pinned_bo(batch, binder->bo, false); - return offset; } +/** + * Reserve a block of space in the binder, given the raw size in bytes. + */ +uint32_t +iris_binder_reserve(struct iris_context *ice, + unsigned size) +{ + struct iris_binder *binder = &ice->state.binder; + + if (!binder_has_space(binder, size)) + binder_realloc(ice); + + assert(size > 0); + return binder_insert(binder, size); +} + /** * Reserve and record binder space for 3D pipeline shader stages. * * Note that you must actually populate the new binding tables after * calling this command - the new area is uninitialized. */ -bool -iris_binder_reserve_3d(struct iris_batch *batch, - struct iris_context *ice) +void +iris_binder_reserve_3d(struct iris_context *ice) { struct iris_compiled_shader **shaders = ice->shaders.prog; - struct iris_binder *binder = &batch->binder; - unsigned total_size = 0; + struct iris_binder *binder = &ice->state.binder; unsigned sizes[MESA_SHADER_STAGES] = {}; + unsigned total_size; - for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(ice->state.dirty & (IRIS_DIRTY_BINDINGS_VS << stage))) - continue; + /* If nothing is dirty, skip all this. */ + if (!(ice->state.dirty & IRIS_ALL_DIRTY_BINDINGS)) + return; + /* Get the binding table sizes for each stage */ + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { if (!shaders[stage]) continue; const struct brw_stage_prog_data *prog_data = (const void *) shaders[stage]->prog_data; + /* Round up the size so our next table has an aligned starting offset */ sizes[stage] = align(prog_data->binding_table.size_bytes, BTP_ALIGNMENT); - total_size += sizes[stage]; } - if (total_size == 0) - return false; + /* Make space for the new binding tables...this may take two tries. 
*/ + while (true) { + total_size = 0; + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { + if (ice->state.dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) + total_size += sizes[stage]; + } - uint32_t offset = iris_binder_reserve(batch, total_size); - bool flushed = offset == INIT_INSERT_POINT; + assert(total_size < IRIS_BINDER_SIZE); - /* Assign space and record the current binding table. */ - for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(ice->state.dirty & (IRIS_DIRTY_BINDINGS_VS << stage))) - continue; + if (total_size == 0) + return; + + if (binder_has_space(binder, total_size)) + break; - binder->bt_offset[stage] = sizes[stage] > 0 ? offset : 0; - offset += sizes[stage]; + /* It didn't fit. Allocate a new buffer and try again. Note that + * this will flag all bindings dirty, which may increase total_size + * on the next iteration. + */ + binder_realloc(ice); } - return flushed; -} + /* Assign space and record the new binding table offsets. */ + uint32_t offset = binder_insert(binder, total_size); -void -iris_init_binder(struct iris_binder *binder, struct iris_bufmgr *bufmgr) -{ - binder->bo = - iris_bo_alloc(bufmgr, "binder", IRIS_BINDER_SIZE, IRIS_MEMZONE_BINDER); - binder->map = iris_bo_map(NULL, binder->bo, MAP_WRITE); - binder->insert_point = INIT_INSERT_POINT; + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { + if (ice->state.dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) { + binder->bt_offset[stage] = sizes[stage] > 0 ? offset : 0; + offset += sizes[stage]; + } + } } -/** - * Is the binder empty? (If so, old binding table pointers are stale.) 
- */ -bool -iris_binder_is_empty(struct iris_binder *binder) +void +iris_init_binder(struct iris_context *ice) { - return binder->insert_point <= INIT_INSERT_POINT; + memset(&ice->state.binder, 0, sizeof(struct iris_binder)); + binder_realloc(ice); } void diff --git a/src/gallium/drivers/iris/iris_binder.h b/src/gallium/drivers/iris/iris_binder.h index bd1e17ae4c4..e63170e298f 100644 --- a/src/gallium/drivers/iris/iris_binder.h +++ b/src/gallium/drivers/iris/iris_binder.h @@ -49,11 +49,9 @@ struct iris_binder uint32_t bt_offset[MESA_SHADER_STAGES]; }; -void iris_init_binder(struct iris_binder *binder, struct iris_bufmgr *bufmgr); -bool iris_binder_is_empty(struct iris_binder *binder); +void iris_init_binder(struct iris_context *ice); void iris_destroy_binder(struct iris_binder *binder); -uint32_t iris_binder_reserve(struct iris_batch *batch, unsigned size); -bool iris_binder_reserve_3d(struct iris_batch *batch, - struct iris_context *ice); +uint32_t iris_binder_reserve(struct iris_context *ice, unsigned size); +void iris_binder_reserve_3d(struct iris_context *ice); #endif diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c index 3ff48ed00f5..e7718eab7eb 100644 --- a/src/gallium/drivers/iris/iris_blorp.c +++ b/src/gallium/drivers/iris/iris_blorp.c @@ -120,7 +120,7 @@ blorp_get_surface_address(struct blorp_batch *blorp_batch, UNUSED static struct blorp_address blorp_get_surface_base_address(UNUSED struct blorp_batch *blorp_batch) { - return (struct blorp_address) { .offset = IRIS_MEMZONE_SURFACE_START }; + return (struct blorp_address) { .offset = IRIS_MEMZONE_BINDER_START }; } static void * @@ -146,17 +146,22 @@ blorp_alloc_binding_table(struct blorp_batch *blorp_batch, void **surface_maps) { struct iris_context *ice = blorp_batch->blorp->driver_ctx; + struct iris_binder *binder = &ice->state.binder; struct iris_batch *batch = blorp_batch->driver_batch; - *bt_offset = iris_binder_reserve(batch, num_entries * sizeof(uint32_t)); 
- uint32_t *bt_map = batch->binder.map + *bt_offset; + *bt_offset = iris_binder_reserve(ice, num_entries * sizeof(uint32_t)); + uint32_t *bt_map = binder->map + *bt_offset; for (unsigned i = 0; i < num_entries; i++) { surface_maps[i] = stream_state(batch, ice->state.surface_uploader, state_size, state_alignment, &surface_offsets[i], NULL); - bt_map[i] = surface_offsets[i]; + bt_map[i] = surface_offsets[i] - (uint32_t) binder->bo->gtt_offset; } + + iris_use_pinned_bo(batch, binder->bo, false); + + ice->vtbl.update_surface_base_address(batch, binder); } static void * diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c index 058ae15ab81..50e7d4f715d 100644 --- a/src/gallium/drivers/iris/iris_bufmgr.c +++ b/src/gallium/drivers/iris/iris_bufmgr.c @@ -244,10 +244,10 @@ bucket_for_size(struct iris_bufmgr *bufmgr, uint64_t size) static enum iris_memory_zone memzone_for_address(uint64_t address) { - STATIC_ASSERT(IRIS_MEMZONE_OTHER_START > IRIS_MEMZONE_DYNAMIC_START); + STATIC_ASSERT(IRIS_MEMZONE_OTHER_START > IRIS_MEMZONE_DYNAMIC_START); STATIC_ASSERT(IRIS_MEMZONE_DYNAMIC_START > IRIS_MEMZONE_SURFACE_START); - STATIC_ASSERT(IRIS_MEMZONE_SURFACE_START > IRIS_MEMZONE_SHADER_START); - STATIC_ASSERT(IRIS_BINDER_ADDRESS == IRIS_MEMZONE_SURFACE_START); + STATIC_ASSERT(IRIS_MEMZONE_SURFACE_START > IRIS_MEMZONE_BINDER_START); + STATIC_ASSERT(IRIS_MEMZONE_BINDER_START > IRIS_MEMZONE_SHADER_START); STATIC_ASSERT(IRIS_BORDER_COLOR_POOL_ADDRESS == IRIS_MEMZONE_DYNAMIC_START); if (address >= IRIS_MEMZONE_OTHER_START) @@ -259,7 +259,7 @@ memzone_for_address(uint64_t address) if (address > IRIS_MEMZONE_DYNAMIC_START) return IRIS_MEMZONE_DYNAMIC; - if (address == IRIS_BINDER_ADDRESS) + if (address > IRIS_MEMZONE_BINDER_START) return IRIS_MEMZONE_BINDER; if (address > IRIS_MEMZONE_SURFACE_START) @@ -365,8 +365,14 @@ bucket_vma_free(struct bo_cache_bucket *bucket, uint64_t address) } static struct bo_cache_bucket * -get_bucket_allocator(struct 
iris_bufmgr *bufmgr, uint64_t size) +get_bucket_allocator(struct iris_bufmgr *bufmgr, + enum iris_memory_zone memzone, + uint64_t size) { + /* Bucketing is not worth using for binders...we'll never have 64... */ + if (memzone == IRIS_MEMZONE_BINDER) + return NULL; + /* Skip using the bucket allocator for very large sizes, as it allocates * 64 of them and this can balloon rather quickly. */ @@ -393,12 +399,11 @@ vma_alloc(struct iris_bufmgr *bufmgr, uint64_t size, uint64_t alignment) { - if (memzone == IRIS_MEMZONE_BINDER) - return IRIS_BINDER_ADDRESS; - else if (memzone == IRIS_MEMZONE_BORDER_COLOR_POOL) + if (memzone == IRIS_MEMZONE_BORDER_COLOR_POOL) return IRIS_BORDER_COLOR_POOL_ADDRESS; - struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size); + struct bo_cache_bucket *bucket = + get_bucket_allocator(bufmgr, memzone, size); uint64_t addr; if (bucket) { @@ -419,8 +424,7 @@ vma_free(struct iris_bufmgr *bufmgr, uint64_t address, uint64_t size) { - if (address == IRIS_BINDER_ADDRESS || - address == IRIS_BORDER_COLOR_POOL_ADDRESS) + if (address == IRIS_BORDER_COLOR_POOL_ADDRESS) return; /* Un-canonicalize the address. 
*/ @@ -429,12 +433,13 @@ vma_free(struct iris_bufmgr *bufmgr, if (address == 0ull) return; - struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size); + enum iris_memory_zone memzone = memzone_for_address(address); + struct bo_cache_bucket *bucket = + get_bucket_allocator(bufmgr, memzone, size); if (bucket) { bucket_vma_free(bucket, address); } else { - enum iris_memory_zone memzone = memzone_for_address(address); util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size); } } @@ -1599,9 +1604,12 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd) util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SHADER], PAGE_SIZE, _4GB); + util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_BINDER], + IRIS_MEMZONE_BINDER_START, + IRIS_MAX_BINDERS * IRIS_BINDER_SIZE); util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE], - IRIS_MEMZONE_SURFACE_START + IRIS_BINDER_SIZE, - _4GB - IRIS_BINDER_SIZE); + IRIS_MEMZONE_SURFACE_START, + _4GB - IRIS_MAX_BINDERS * IRIS_BINDER_SIZE); util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC], IRIS_MEMZONE_DYNAMIC_START + IRIS_BORDER_COLOR_POOL_SIZE, _4GB - IRIS_BORDER_COLOR_POOL_SIZE); diff --git a/src/gallium/drivers/iris/iris_bufmgr.h b/src/gallium/drivers/iris/iris_bufmgr.h index 8be545cb04b..9210f44c944 100644 --- a/src/gallium/drivers/iris/iris_bufmgr.h +++ b/src/gallium/drivers/iris/iris_bufmgr.h @@ -48,14 +48,11 @@ struct pipe_debug_callback; * * We lay out the virtual address space as follows: * - * - [0, 4K): Nothing (empty page for null address) - * - [4K, 4G): Shaders (Instruction Base Address) - * - [4G, 8G): Surfaces (Surface State Base Address, Bindless ...) - * - [8G, 12G): Dynamic (Dynamic State Base Address) - * - [12G, *): Other (everything else in the full 48-bit VMA) - * - * A special 64kB "binder" buffer lives at the start of the surface memory - * zone, holding binding tables referring to objects in the rest of the zone. 
+ * - [0, 4K): Nothing (empty page for null address) + * - [4K, 4G): Shaders (Instruction Base Address) + * - [4G, 8G): Surfaces & Binders (Surface State Base Address, Bindless ...) + * - [8G, 12G): Dynamic (Dynamic State Base Address) + * - [12G, *): Other (everything else in the full 48-bit VMA) * * A special buffer for border color lives at the start of the dynamic state * memory zone. This unfortunately has to be handled specially because the @@ -65,32 +62,29 @@ struct pipe_debug_callback; * each a separate VMA. However, we assign address globally, so buffers will * have the same address in all GEM contexts. This lets us have a single BO * field for the address, which is easy and cheap. - * - * One exception is the special "binder" BO. Binders are context-local, - * so while there are many of them, all binders are stored at the same - * fixed address (in different VMAs). */ enum iris_memory_zone { IRIS_MEMZONE_SHADER, + IRIS_MEMZONE_BINDER, IRIS_MEMZONE_SURFACE, IRIS_MEMZONE_DYNAMIC, IRIS_MEMZONE_OTHER, - IRIS_MEMZONE_BINDER, IRIS_MEMZONE_BORDER_COLOR_POOL, }; /* Intentionally exclude single buffer "zones" */ #define IRIS_MEMZONE_COUNT (IRIS_MEMZONE_OTHER + 2) +#define IRIS_BINDER_SIZE (64 * 1024) +#define IRIS_MAX_BINDERS 100 + #define IRIS_MEMZONE_SHADER_START (0ull * (1ull << 32)) -#define IRIS_MEMZONE_SURFACE_START (1ull * (1ull << 32)) +#define IRIS_MEMZONE_BINDER_START (1ull * (1ull << 32)) +#define IRIS_MEMZONE_SURFACE_START (IRIS_MEMZONE_BINDER_START + IRIS_MAX_BINDERS * IRIS_BINDER_SIZE) #define IRIS_MEMZONE_DYNAMIC_START (2ull * (1ull << 32)) #define IRIS_MEMZONE_OTHER_START (3ull * (1ull << 32)) -#define IRIS_BINDER_ADDRESS IRIS_MEMZONE_SURFACE_START -#define IRIS_BINDER_SIZE (64 * 1024) - #define IRIS_BORDER_COLOR_POOL_ADDRESS IRIS_MEMZONE_DYNAMIC_START #define IRIS_BORDER_COLOR_POOL_SIZE (64 * 1024) diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c index daaa9409d2e..bc637ea0492 100644 --- 
a/src/gallium/drivers/iris/iris_context.c +++ b/src/gallium/drivers/iris/iris_context.c @@ -130,6 +130,7 @@ iris_destroy_context(struct pipe_context *ctx) slab_destroy_child(&ice->transfer_pool); iris_batch_free(&ice->render_batch); + iris_destroy_binder(&ice->state.binder); ralloc_free(ice); } @@ -189,14 +190,15 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) iris_init_program_cache(ice); iris_init_border_color_pool(ice); + iris_init_binder(ice); slab_create_child(&ice->transfer_pool, &screen->transfer_pool); ice->state.surface_uploader = - u_upload_create(&ice->ctx, 16384, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + u_upload_create(ctx, 16384, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, IRIS_RESOURCE_FLAG_SURFACE_MEMZONE); ice->state.dynamic_uploader = - u_upload_create(&ice->ctx, 16384, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + u_upload_create(ctx, 16384, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE); genX_call(devinfo, init_state, ice); diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index f7411727bb0..a01e0d13eb4 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -30,6 +30,7 @@ #include "intel/common/gen_debug.h" #include "intel/compiler/brw_compiler.h" #include "iris_batch.h" +#include "iris_binder.h" #include "iris_resource.h" #include "iris_screen.h" @@ -109,6 +110,13 @@ struct blorp_params; #define IRIS_DIRTY_VF (1ull << 52) #define IRIS_DIRTY_VF_TOPOLOGY (1ull << 53) +#define IRIS_ALL_DIRTY_BINDINGS (IRIS_DIRTY_BINDINGS_VS | \ + IRIS_DIRTY_BINDINGS_TCS | \ + IRIS_DIRTY_BINDINGS_TES | \ + IRIS_DIRTY_BINDINGS_GS | \ + IRIS_DIRTY_BINDINGS_FS | \ + IRIS_DIRTY_BINDINGS_CS) + /** * Non-orthogonal state (NOS) dependency flags. 
* @@ -262,6 +270,8 @@ struct iris_vtable { void (*upload_render_state)(struct iris_context *ice, struct iris_batch *batch, const struct pipe_draw_info *draw); + void (*update_surface_base_address)(struct iris_batch *batch, + struct iris_binder *binder); void (*emit_raw_pipe_control)(struct iris_batch *batch, uint32_t flags, struct iris_bo *bo, uint32_t offset, uint64_t imm); @@ -382,6 +392,8 @@ struct iris_context { // "I'm streaming this out at draw time and never want it again!" struct u_upload_mgr *dynamic_uploader; + struct iris_binder binder; + struct iris_border_color_pool border_color_pool; /** diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c index f6911350a7b..0567bbac72e 100644 --- a/src/gallium/drivers/iris/iris_draw.c +++ b/src/gallium/drivers/iris/iris_draw.c @@ -80,21 +80,9 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) iris_predraw_resolve_inputs(ice, batch); iris_predraw_resolve_framebuffer(ice, batch); - if (iris_binder_is_empty(&batch->binder)) { - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS | - IRIS_DIRTY_BINDINGS_TCS | - IRIS_DIRTY_BINDINGS_TES | - IRIS_DIRTY_BINDINGS_GS | - IRIS_DIRTY_BINDINGS_FS; - } + iris_binder_reserve_3d(ice); - if (iris_binder_reserve_3d(batch, ice)) { - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS | - IRIS_DIRTY_BINDINGS_TCS | - IRIS_DIRTY_BINDINGS_TES | - IRIS_DIRTY_BINDINGS_GS | - IRIS_DIRTY_BINDINGS_FS; - } + ice->vtbl.update_surface_base_address(batch, &ice->state.binder); ice->vtbl.upload_render_state(ice, batch, info); ice->state.dirty = 0ull; diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 454e05979e4..54bf3fd6023 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -445,6 +445,36 @@ emit_state(struct iris_batch *batch, #define cso_changed_memcmp(x) \ (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0) +static void 
+flush_for_state_base_change(struct iris_batch *batch)
+{
+   /* Flush before emitting STATE_BASE_ADDRESS.
+    *
+    * This isn't documented anywhere in the PRM.  However, it seems to be
+    * necessary prior to changing the surface state base address.  We've
+    * seen issues in Vulkan where we get GPU hangs when using multi-level
+    * command buffers which clear depth, reset state base address, and then
+    * go render stuff.
+    *
+    * Normally, in GL, we would trust the kernel to do sufficient stalls
+    * and flushes prior to executing our batch.  However, it doesn't seem
+    * as if the kernel's flushing is always sufficient and we don't want to
+    * rely on it.
+    *
+    * We make this an end-of-pipe sync instead of a normal flush because we
+    * do not know the current status of the GPU.  On Haswell at least,
+    * having a fast-clear operation in flight at the same time as a normal
+    * rendering operation can cause hangs.  Since the kernel's flushing is
+    * insufficient, we need to ensure that any rendering operations from
+    * other processes are definitely complete before we try to do our own
+    * rendering.  It's a bit of a big hammer but it appears to work.
+    */
+   iris_emit_end_of_pipe_sync(batch,
+                              PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                              PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                              PIPE_CONTROL_DATA_CACHE_FLUSH);
+}
+
 /**
  * Upload the initial GPU state for a render context.
  *
@@ -459,18 +489,19 @@ iris_init_render_context(struct iris_screen *screen,
 {
    iris_init_batch(batch, screen, vtbl, dbg, I915_EXEC_RENDER);
 
-   /* XXX: PIPE_CONTROLs */
+   flush_for_state_base_change(batch);
 
    /* We program STATE_BASE_ADDRESS once at context initialization time.
     * Each base address points at a 4GB memory zone, and never needs to
     * change.  See iris_bufmgr.h for a description of the memory zones.
+    *
+    * Except for Surface State Base Address.  That one changes.
     */
    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
    #if 0 // XXX: MOCS is stupid for this.
sba.GeneralStateMemoryObjectControlState = MOCS_WB; sba.StatelessDataPortAccessMemoryObjectControlState = MOCS_WB; - sba.SurfaceStateMemoryObjectControlState = MOCS_WB; sba.DynamicStateMemoryObjectControlState = MOCS_WB; sba.IndirectObjectMemoryObjectControlState = MOCS_WB; sba.InstructionMemoryObjectControlState = MOCS_WB; @@ -478,7 +509,6 @@ iris_init_render_context(struct iris_screen *screen, #endif sba.GeneralStateBaseAddressModifyEnable = true; - sba.SurfaceStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddressModifyEnable = true; sba.IndirectObjectBaseAddressModifyEnable = true; sba.InstructionBaseAddressModifyEnable = true; @@ -489,7 +519,6 @@ iris_init_render_context(struct iris_screen *screen, sba.InstructionBuffersizeModifyEnable = true; sba.InstructionBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SHADER_START); - sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SURFACE_START); sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START); sba.GeneralStateBufferSize = 0xfffff; @@ -3063,6 +3092,9 @@ use_ssbo(struct iris_batch *batch, struct iris_context *ice, return surf_state->offset; } +#define push_bt_entry(addr) \ + assert(addr >= binder_addr); bt_map[s++] = (addr) - binder_addr; + /** * Populate the binding table for a given shader stage. 
* @@ -3075,13 +3107,14 @@ iris_populate_binding_table(struct iris_context *ice, struct iris_batch *batch, gl_shader_stage stage) { - const struct iris_binder *binder = &batch->binder; + const struct iris_binder *binder = &ice->state.binder; struct iris_compiled_shader *shader = ice->shaders.prog[stage]; if (!shader) return; const struct shader_info *info = iris_get_shader_info(ice, stage); struct iris_shader_state *shs = &ice->state.shaders[stage]; + uint32_t binder_addr = binder->bo->gtt_offset; //struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; uint32_t *bt_map = binder->map + binder->bt_offset[stage]; @@ -3092,13 +3125,14 @@ iris_populate_binding_table(struct iris_context *ice, /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */ if (cso_fb->nr_cbufs) { for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { - if (cso_fb->cbufs[i]) - bt_map[s++] = use_surface(batch, cso_fb->cbufs[i], true); - else - bt_map[s++] = use_null_fb_surface(batch, ice); + uint32_t addr = + cso_fb->cbufs[i] ? use_surface(batch, cso_fb->cbufs[i], true) + : use_null_fb_surface(batch, ice); + push_bt_entry(addr); } } else { - bt_map[s++] = use_null_fb_surface(batch, ice); + uint32_t addr = use_null_fb_surface(batch, ice); + push_bt_entry(addr); } } @@ -3107,8 +3141,9 @@ iris_populate_binding_table(struct iris_context *ice, for (int i = 0; i < shs->num_textures; i++) { struct iris_sampler_view *view = shs->textures[i]; - bt_map[s++] = view ? use_sampler_view(batch, view) - : use_null_surface(batch, ice); + uint32_t addr = view ? use_sampler_view(batch, view) + : use_null_surface(batch, ice); + push_bt_entry(addr); } for (int i = 0; i < 1 + info->num_ubos; i++) { @@ -3116,7 +3151,8 @@ iris_populate_binding_table(struct iris_context *ice, if (!cbuf->surface_state.res) break; - bt_map[s++] = use_const_buffer(batch, cbuf); + uint32_t addr = use_const_buffer(batch, cbuf); + push_bt_entry(addr); } /* XXX: st is wasting 16 binding table slots for ABOs. 
Should add a cap @@ -3126,7 +3162,8 @@ iris_populate_binding_table(struct iris_context *ice, */ if (info->num_abos + info->num_ssbos > 0) { for (int i = 0; i < IRIS_MAX_ABOS + info->num_ssbos; i++) { - bt_map[s++] = use_ssbo(batch, ice, shs, i); + uint32_t addr = use_ssbo(batch, ice, shs, i); + push_bt_entry(addr); } } @@ -3263,6 +3300,27 @@ iris_restore_context_saved_bos(struct iris_context *ice, } } +/** + * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address. + */ +static void +iris_update_surface_base_address(struct iris_batch *batch, + struct iris_binder *binder) +{ + if (batch->last_surface_base_address == binder->bo->gtt_offset) + return; + + flush_for_state_base_change(batch); + + iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { + // XXX: sba.SurfaceStateMemoryObjectControlState = MOCS_WB; + sba.SurfaceStateBaseAddressModifyEnable = true; + sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0); + } + + batch->last_surface_base_address = binder->bo->gtt_offset; +} + static void iris_upload_dirty_render_state(struct iris_context *ice, struct iris_batch *batch, @@ -3274,6 +3332,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, return; struct iris_genx_state *genx = ice->state.genx; + struct iris_binder *binder = &ice->state.binder; struct brw_wm_prog_data *wm_prog_data = (void *) ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data; @@ -3426,7 +3485,12 @@ iris_upload_dirty_render_state(struct iris_context *ice, } } - struct iris_binder *binder = &batch->binder; + /* Always pin the binder. If we're emitting new binding table pointers, + * we need it. If not, we're probably inheriting old tables via the + * context, and need it anyway. Since true zero-bindings cases are + * practically non-existent, just pin it and avoid last_res tracking. 
+ */ + iris_use_pinned_bo(batch, binder->bo, false); for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) { @@ -4309,6 +4373,7 @@ genX(init_state)(struct iris_context *ice) ice->vtbl.destroy_state = iris_destroy_state; ice->vtbl.init_render_context = iris_init_render_context; ice->vtbl.upload_render_state = iris_upload_render_state; + ice->vtbl.update_surface_base_address = iris_update_surface_base_address; ice->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control; ice->vtbl.derived_program_state_size = iris_derived_program_state_size; ice->vtbl.store_derived_program_state = iris_store_derived_program_state;