#include "anv_private.h"
+#include "gen7_pack.h"
+#include "gen8_pack.h"
+
/** \file anv_batch_chain.c
*
* This file contains functions related to anv_cmd_buffer as a data
static VkResult
anv_reloc_list_init_clone(struct anv_reloc_list *list,
- struct anv_device *device,
+ const VkAllocationCallbacks *alloc,
const struct anv_reloc_list *other_list)
{
if (other_list) {
}
list->relocs =
- anv_device_alloc(device, list->array_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (list->relocs == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
list->reloc_bos =
- anv_device_alloc(device, list->array_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (list->reloc_bos == NULL) {
- anv_device_free(device, list->relocs);
+ anv_free(alloc, list->relocs);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
}
}
VkResult
-anv_reloc_list_init(struct anv_reloc_list *list, struct anv_device *device)
+anv_reloc_list_init(struct anv_reloc_list *list,
+ const VkAllocationCallbacks *alloc)
{
- return anv_reloc_list_init_clone(list, device, NULL);
+ return anv_reloc_list_init_clone(list, alloc, NULL);
}
void
-anv_reloc_list_finish(struct anv_reloc_list *list, struct anv_device *device)
+anv_reloc_list_finish(struct anv_reloc_list *list,
+ const VkAllocationCallbacks *alloc)
{
- anv_device_free(device, list->relocs);
- anv_device_free(device, list->reloc_bos);
+ anv_free(alloc, list->relocs);
+ anv_free(alloc, list->reloc_bos);
}
static VkResult
-anv_reloc_list_grow(struct anv_reloc_list *list, struct anv_device *device,
+anv_reloc_list_grow(struct anv_reloc_list *list,
+ const VkAllocationCallbacks *alloc,
size_t num_additional_relocs)
{
if (list->num_relocs + num_additional_relocs <= list->array_length)
new_length *= 2;
struct drm_i915_gem_relocation_entry *new_relocs =
- anv_device_alloc(device, new_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(alloc, new_length * sizeof(*list->relocs), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_relocs == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
struct anv_bo **new_reloc_bos =
- anv_device_alloc(device, new_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (new_relocs == NULL) {
+   if (new_reloc_bos == NULL) {
- anv_device_free(device, new_relocs);
+ anv_free(alloc, new_relocs);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
}
memcpy(new_reloc_bos, list->reloc_bos,
list->num_relocs * sizeof(*list->reloc_bos));
- anv_device_free(device, list->relocs);
- anv_device_free(device, list->reloc_bos);
+ anv_free(alloc, list->relocs);
+ anv_free(alloc, list->reloc_bos);
list->array_length = new_length;
list->relocs = new_relocs;
}
uint64_t
-anv_reloc_list_add(struct anv_reloc_list *list, struct anv_device *device,
+anv_reloc_list_add(struct anv_reloc_list *list,
+ const VkAllocationCallbacks *alloc,
uint32_t offset, struct anv_bo *target_bo, uint32_t delta)
{
struct drm_i915_gem_relocation_entry *entry;
int index;
- anv_reloc_list_grow(list, device, 1);
+ anv_reloc_list_grow(list, alloc, 1);
/* TODO: Handle failure */
/* XXX: Can we use I915_EXEC_HANDLE_LUT? */
entry->presumed_offset = target_bo->offset;
entry->read_domains = 0;
entry->write_domain = 0;
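+   /* This entry is consumed directly by the kernel via the execbuf2 relocs
+    * pointer, so every byte must be defined before submission. Under
+    * Valgrind, the check below flags any uninitialized field here rather
+    * than at submit time.
+    */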
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
return target_bo->offset + delta;
}
static void
-anv_reloc_list_append(struct anv_reloc_list *list, struct anv_device *device,
+anv_reloc_list_append(struct anv_reloc_list *list,
+ const VkAllocationCallbacks *alloc,
struct anv_reloc_list *other, uint32_t offset)
{
- anv_reloc_list_grow(list, device, other->num_relocs);
+ anv_reloc_list_grow(list, alloc, other->num_relocs);
/* TODO: Handle failure */
memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
anv_batch_emit_reloc(struct anv_batch *batch,
void *location, struct anv_bo *bo, uint32_t delta)
{
- return anv_reloc_list_add(batch->relocs, batch->device,
+ return anv_reloc_list_add(batch->relocs, batch->alloc,
location - batch->start, bo, delta);
}
memcpy(batch->next, other->start, size);
offset = batch->next - batch->start;
- anv_reloc_list_append(batch->relocs, batch->device,
+ anv_reloc_list_append(batch->relocs, batch->alloc,
other->relocs, offset);
batch->next += size;
*-----------------------------------------------------------------------*/
static VkResult
-anv_batch_bo_create(struct anv_device *device, struct anv_batch_bo **bbo_out)
+anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_batch_bo **bbo_out)
{
VkResult result;
- struct anv_batch_bo *bbo =
- anv_device_alloc(device, sizeof(*bbo), 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ struct anv_batch_bo *bbo = anv_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- result = anv_bo_pool_alloc(&device->batch_bo_pool, &bbo->bo);
+ result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init(&bbo->relocs, device);
+ result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
return VK_SUCCESS;
fail_bo_alloc:
- anv_bo_pool_free(&device->batch_bo_pool, &bbo->bo);
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
fail_alloc:
- anv_device_free(device, bbo);
+ anv_free(&cmd_buffer->pool->alloc, bbo);
return result;
}
static VkResult
-anv_batch_bo_clone(struct anv_device *device,
+anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
const struct anv_batch_bo *other_bbo,
struct anv_batch_bo **bbo_out)
{
VkResult result;
- struct anv_batch_bo *bbo =
- anv_device_alloc(device, sizeof(*bbo), 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ struct anv_batch_bo *bbo = anv_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- result = anv_bo_pool_alloc(&device->batch_bo_pool, &bbo->bo);
+ result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init_clone(&bbo->relocs, device, &other_bbo->relocs);
+ result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
+ &other_bbo->relocs);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
bbo->length = other_bbo->length;
memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);
+ bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset;
+
*bbo_out = bbo;
return VK_SUCCESS;
fail_bo_alloc:
- anv_bo_pool_free(&device->batch_bo_pool, &bbo->bo);
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
fail_alloc:
- anv_device_free(device, bbo);
+ anv_free(&cmd_buffer->pool->alloc, bbo);
return result;
}
batch->next = batch->start = bbo->bo.map;
batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
batch->relocs = &bbo->relocs;
+ bbo->last_ss_pool_bo_offset = 0;
bbo->relocs.num_relocs = 0;
}
}
static void
-anv_batch_bo_destroy(struct anv_batch_bo *bbo, struct anv_device *device)
+anv_batch_bo_destroy(struct anv_batch_bo *bbo,
+ struct anv_cmd_buffer *cmd_buffer)
{
- anv_reloc_list_finish(&bbo->relocs, device);
- anv_bo_pool_free(&device->batch_bo_pool, &bbo->bo);
- anv_device_free(device, bbo);
+ anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
+ anv_free(&cmd_buffer->pool->alloc, bbo);
}
static VkResult
-anv_batch_bo_list_clone(const struct list_head *list, struct anv_device *device,
+anv_batch_bo_list_clone(const struct list_head *list,
+ struct anv_cmd_buffer *cmd_buffer,
struct list_head *new_list)
{
VkResult result = VK_SUCCESS;
struct anv_batch_bo *prev_bbo = NULL;
list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
struct anv_batch_bo *new_bbo;
- result = anv_batch_bo_clone(device, bbo, &new_bbo);
+ result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
if (result != VK_SUCCESS)
break;
list_addtail(&new_bbo->link, new_list);
if (result != VK_SUCCESS) {
list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link)
- anv_batch_bo_destroy(bbo, device);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
}
return result;
return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
}
-static inline struct anv_batch_bo *
-anv_cmd_buffer_current_surface_bbo(struct anv_cmd_buffer *cmd_buffer)
+struct anv_address
+anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
{
- return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->surface_bos.prev, link);
+ return (struct anv_address) {
+ .bo = &cmd_buffer->device->surface_state_block_pool.bo,
+ .offset = *(int32_t *)anv_vector_head(&cmd_buffer->bt_blocks),
+ };
}
-struct anv_bo *
-anv_cmd_buffer_current_surface_bo(struct anv_cmd_buffer *cmd_buffer)
+static void
+emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_bo *bo, uint32_t offset)
{
- return &anv_cmd_buffer_current_surface_bbo(cmd_buffer)->bo;
-}
+ /* In gen8+ the address field grew to two dwords to accommodate 48-bit
+ * offsets. The high 16 bits are in the last dword, so we can use the gen8
+ * version in either case, as long as we set the instruction length in the
+ * header accordingly. This means that we always emit three dwords here
+ * and all the padding and adjustment we do in this file works for all
+ * gens.
+ */
-struct anv_reloc_list *
-anv_cmd_buffer_current_surface_relocs(struct anv_cmd_buffer *cmd_buffer)
-{
- return &anv_cmd_buffer_current_surface_bbo(cmd_buffer)->relocs;
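+   /* As a concrete check of the math below (assuming the instruction
+    * sizes from the genN_pack.h headers): MI_BATCH_BUFFER_START is 2
+    * dwords on gen7 and 3 dwords on gen8, and the MI length bias is 2,
+    * so gen7_length works out to 0 and gen8_length to 1.
+    */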
+ const uint32_t gen7_length =
+ GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias;
+ const uint32_t gen8_length =
+ GEN8_MI_BATCH_BUFFER_START_length - GEN8_MI_BATCH_BUFFER_START_length_bias;
+
+ anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START,
+ .DwordLength = cmd_buffer->device->info.gen < 8 ?
+ gen7_length : gen8_length,
+ ._2ndLevelBatchBuffer = _1stlevelbatch,
+ .AddressSpaceIndicator = ASI_PPGTT,
+ .BatchBufferStartAddress = { bo, offset });
}
static void
batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4;
assert(batch->end == current_bbo->bo.map + current_bbo->bo.size);
- anv_batch_emit(batch, GEN8_MI_BATCH_BUFFER_START,
- GEN8_MI_BATCH_BUFFER_START_header,
- ._2ndLevelBatchBuffer = _1stlevelbatch,
- .AddressSpaceIndicator = ASI_PPGTT,
- .BatchBufferStartAddress = { &bbo->bo, 0 },
- );
+ emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0);
anv_batch_bo_finish(current_bbo, batch);
}
struct anv_cmd_buffer *cmd_buffer = _data;
struct anv_batch_bo *new_bbo;
- VkResult result = anv_batch_bo_create(cmd_buffer->device, &new_bbo);
+ VkResult result = anv_batch_bo_create(cmd_buffer, &new_bbo);
if (result != VK_SUCCESS)
return result;
struct anv_batch_bo **seen_bbo = anv_vector_add(&cmd_buffer->seen_bbos);
if (seen_bbo == NULL) {
- anv_batch_bo_destroy(new_bbo, cmd_buffer->device);
+ anv_batch_bo_destroy(new_bbo, cmd_buffer);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
}
*seen_bbo = new_bbo;
}
struct anv_state
-anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer,
- uint32_t size, uint32_t alignment)
+anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t entries, uint32_t *state_offset)
{
- struct anv_bo *surface_bo =
- anv_cmd_buffer_current_surface_bo(cmd_buffer);
+ struct anv_block_pool *block_pool =
+ &cmd_buffer->device->surface_state_block_pool;
+ int32_t *bt_block = anv_vector_head(&cmd_buffer->bt_blocks);
struct anv_state state;
- state.offset = align_u32(cmd_buffer->surface_next, alignment);
- if (state.offset + size > surface_bo->size)
+ state.alloc_size = align_u32(entries * 4, 32);
+
+ if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size)
return (struct anv_state) { 0 };
- state.map = surface_bo->map + state.offset;
- state.alloc_size = size;
- cmd_buffer->surface_next = state.offset + size;
+ state.offset = cmd_buffer->bt_next;
+ state.map = block_pool->map + *bt_block + state.offset;
- assert(state.offset + size <= surface_bo->size);
+ cmd_buffer->bt_next += state.alloc_size;
+
+ assert(*bt_block < 0);
+ *state_offset = -(*bt_block);
return state;
}
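+
+/* A sketch of the offset arithmetic above: binding table blocks come from
+ * the back of the pool, so if this block sits at -4096 relative to the
+ * pool center, state_offset comes back as 4096. A surface state at
+ * center-relative offset X is then written into the table as X + 4096,
+ * i.e. relative to this block's start, which is what
+ * anv_cmd_buffer_surface_base_address() reports as the base address.
+ */
+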
+struct anv_state
+anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
+{
+ return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
+}
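+
+/* The 64/64 above is an assumption worth noting: 64 bytes is the size of
+ * gen8 RENDER_SURFACE_STATE and also satisfies gen7's smaller surface
+ * state, so one size and alignment works for every gen this file handles.
+ */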
+
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
uint32_t size, uint32_t alignment)
}
VkResult
-anv_cmd_buffer_new_surface_state_bo(struct anv_cmd_buffer *cmd_buffer)
+anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_batch_bo *new_bbo, *old_bbo =
- anv_cmd_buffer_current_surface_bbo(cmd_buffer);
-
- /* Finish off the old buffer */
- old_bbo->length = cmd_buffer->surface_next;
-
- VkResult result = anv_batch_bo_create(cmd_buffer->device, &new_bbo);
- if (result != VK_SUCCESS)
- return result;
+ struct anv_block_pool *block_pool =
+ &cmd_buffer->device->surface_state_block_pool;
- struct anv_batch_bo **seen_bbo = anv_vector_add(&cmd_buffer->seen_bbos);
- if (seen_bbo == NULL) {
- anv_batch_bo_destroy(new_bbo, cmd_buffer->device);
+ int32_t *offset = anv_vector_add(&cmd_buffer->bt_blocks);
+ if (offset == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- *seen_bbo = new_bbo;
-
- cmd_buffer->surface_next = 1;
- list_addtail(&new_bbo->link, &cmd_buffer->surface_bos);
+ *offset = anv_block_pool_alloc_back(block_pool);
+ cmd_buffer->bt_next = 0;
return VK_SUCCESS;
}
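+
+/* A note on the sign convention assumed above: anv_block_pool_alloc_back()
+ * hands out blocks below the pool's center, so every entry in bt_blocks is
+ * negative. That is what the assert in anv_cmd_buffer_alloc_binding_table()
+ * checks before negating the value into a usable state_offset.
+ */
+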
VkResult
anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_batch_bo *batch_bo, *surface_bbo;
- struct anv_device *device = cmd_buffer->device;
+ struct anv_batch_bo *batch_bo;
VkResult result;
list_inithead(&cmd_buffer->batch_bos);
- list_inithead(&cmd_buffer->surface_bos);
- result = anv_batch_bo_create(device, &batch_bo);
+ result = anv_batch_bo_create(cmd_buffer, &batch_bo);
if (result != VK_SUCCESS)
return result;
list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
- cmd_buffer->batch.device = device;
+ cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
cmd_buffer->batch.user_data = cmd_buffer;
anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
GEN8_MI_BATCH_BUFFER_START_length * 4);
- result = anv_batch_bo_create(device, &surface_bbo);
- if (result != VK_SUCCESS)
- goto fail_batch_bo;
-
- list_addtail(&surface_bbo->link, &cmd_buffer->surface_bos);
-
int success = anv_vector_init(&cmd_buffer->seen_bbos,
sizeof(struct anv_bo *),
8 * sizeof(struct anv_bo *));
if (!success)
- goto fail_surface_bo;
+ goto fail_batch_bo;
*(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
- *(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) = surface_bbo;
- /* Start surface_next at 1 so surface offset 0 is invalid. */
- cmd_buffer->surface_next = 1;
+ success = anv_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t),
+ 8 * sizeof(int32_t));
+ if (!success)
+ goto fail_seen_bbos;
+
+ result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
+ &cmd_buffer->pool->alloc);
+ if (result != VK_SUCCESS)
+ goto fail_bt_blocks;
+
+   result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+   if (result != VK_SUCCESS) {
+      anv_reloc_list_finish(&cmd_buffer->surface_relocs,
+                            &cmd_buffer->pool->alloc);
+      goto fail_bt_blocks;
+   }
cmd_buffer->execbuf2.objects = NULL;
cmd_buffer->execbuf2.bos = NULL;
return VK_SUCCESS;
- fail_surface_bo:
- anv_batch_bo_destroy(surface_bbo, device);
+ fail_bt_blocks:
+ anv_vector_finish(&cmd_buffer->bt_blocks);
+ fail_seen_bbos:
+ anv_vector_finish(&cmd_buffer->seen_bbos);
fail_batch_bo:
- anv_batch_bo_destroy(batch_bo, device);
+ anv_batch_bo_destroy(batch_bo, cmd_buffer);
return result;
}
void
anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_device *device = cmd_buffer->device;
+ int32_t *bt_block;
+ anv_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
+ anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
+ *bt_block);
+ }
+ anv_vector_finish(&cmd_buffer->bt_blocks);
+
+ anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
anv_vector_finish(&cmd_buffer->seen_bbos);
/* Destroy all of the batch buffers */
list_for_each_entry_safe(struct anv_batch_bo, bbo,
&cmd_buffer->batch_bos, link) {
- anv_batch_bo_destroy(bbo, device);
- }
-
- /* Destroy all of the surface state buffers */
- list_for_each_entry_safe(struct anv_batch_bo, bbo,
- &cmd_buffer->surface_bos, link) {
- anv_batch_bo_destroy(bbo, device);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
}
- anv_device_free(device, cmd_buffer->execbuf2.objects);
- anv_device_free(device, cmd_buffer->execbuf2.bos);
+ anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects);
+ anv_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos);
}
void
anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_device *device = cmd_buffer->device;
-
/* Delete all but the first batch bo */
assert(!list_empty(&cmd_buffer->batch_bos));
while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
list_del(&bbo->link);
- anv_batch_bo_destroy(bbo, device);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
}
assert(!list_empty(&cmd_buffer->batch_bos));
&cmd_buffer->batch,
GEN8_MI_BATCH_BUFFER_START_length * 4);
- /* Delete all but the first batch bo */
- assert(!list_empty(&cmd_buffer->batch_bos));
- while (cmd_buffer->surface_bos.next != cmd_buffer->surface_bos.prev) {
- struct anv_batch_bo *bbo = anv_cmd_buffer_current_surface_bbo(cmd_buffer);
- list_del(&bbo->link);
- anv_batch_bo_destroy(bbo, device);
+ while (anv_vector_length(&cmd_buffer->bt_blocks) > 1) {
+ int32_t *bt_block = anv_vector_remove(&cmd_buffer->bt_blocks);
+ anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
+ *bt_block);
}
- assert(!list_empty(&cmd_buffer->batch_bos));
-
- anv_cmd_buffer_current_surface_bbo(cmd_buffer)->relocs.num_relocs = 0;
+ assert(anv_vector_length(&cmd_buffer->bt_blocks) == 1);
+ cmd_buffer->bt_next = 0;
- cmd_buffer->surface_next = 1;
+ cmd_buffer->surface_relocs.num_relocs = 0;
/* Reset the list of seen buffers */
cmd_buffer->seen_bbos.head = 0;
*(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) =
anv_cmd_buffer_current_batch_bo(cmd_buffer);
- *(struct anv_batch_bo **)anv_vector_add(&cmd_buffer->seen_bbos) =
- anv_cmd_buffer_current_surface_bbo(cmd_buffer);
}
void
anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
- struct anv_batch_bo *surface_bbo =
- anv_cmd_buffer_current_surface_bbo(cmd_buffer);
- if (cmd_buffer->level == VK_CMD_BUFFER_LEVEL_PRIMARY) {
- anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_END);
+ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ anv_batch_emit(&cmd_buffer->batch, GEN7_MI_BATCH_BUFFER_END);
/* Round batch up to an even number of dwords. */
if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
- anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP);
+ anv_batch_emit(&cmd_buffer->batch, GEN7_MI_NOOP);
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
- } else {
+ }
+
+ anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
+
+ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
/* If this is a secondary command buffer, we need to determine the
* mode in which it will be executed with vkExecuteCommands. We
* determine this statically here so that this stays in sync with the
 * probably better off simply copying it into our batch.
*/
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
- } else if (cmd_buffer->opt_flags &
- VK_CMD_BUFFER_OPTIMIZE_NO_SIMULTANEOUS_USE_BIT) {
+ } else if (!(cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
- /* For chaining mode, we need to increment the number of
- * relocations. This is because, when we chain, we need to add
- * an MI_BATCH_BUFFER_START command. Adding this command will
- * also add a relocation. In order to handle theis we'll
- * increment it here and decrement it right before adding the
+ /* When we chain, we need to add an MI_BATCH_BUFFER_START command
+ * with its relocation. In order to handle this we'll increment here
+ * so we can unconditionally decrement right before adding the
* MI_BATCH_BUFFER_START command.
*/
anv_cmd_buffer_current_batch_bo(cmd_buffer)->relocs.num_relocs++;
+ cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4;
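+      /* The matching rollback lives in the secondary-execution path below:
+       * it decrements num_relocs and rewinds batch.next by these same
+       * three dwords before emitting the real MI_BATCH_BUFFER_START.
+       */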
} else {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
}
}
-
- anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
-
- surface_bbo->length = cmd_buffer->surface_next;
}
static inline VkResult
struct anv_batch_bo *last_bbo =
list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
- anv_batch_emit(&primary->batch, GEN8_MI_BATCH_BUFFER_START,
- GEN8_MI_BATCH_BUFFER_START_header,
- ._2ndLevelBatchBuffer = _1stlevelbatch,
- .AddressSpaceIndicator = ASI_PPGTT,
- .BatchBufferStartAddress = { &first_bbo->bo, 0 },
- );
+ emit_batch_buffer_start(primary, &first_bbo->bo, 0);
struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
assert(primary->batch.start == this_bbo->bo.map);
uint32_t offset = primary->batch.next - primary->batch.start;
+ const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4;
- struct GEN8_MI_BATCH_BUFFER_START ret = {
- GEN8_MI_BATCH_BUFFER_START_header,
- ._2ndLevelBatchBuffer = _1stlevelbatch,
- .AddressSpaceIndicator = ASI_PPGTT,
- .BatchBufferStartAddress = { &this_bbo->bo, offset },
- };
- last_bbo->relocs.num_relocs++;
- GEN8_MI_BATCH_BUFFER_START_pack(&secondary->batch,
- last_bbo->bo.map + last_bbo->length,
- &ret);
-
+ /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so we
+ * can emit a new command and relocation for the current splice. In
+ * order to handle the initial-use case, we incremented next and
+ * num_relocs in end_batch_buffer() so we can always just subtract
+ * here.
+ */
+ last_bbo->relocs.num_relocs--;
+ secondary->batch.next -= inst_size;
+ emit_batch_buffer_start(secondary, &this_bbo->bo, offset);
anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
+
+ /* After patching up the secondary buffer, we need to clflush the
+ * modified instruction in case we're on a !llc platform. We use a
+ * little loop to handle the case where the instruction crosses a cache
+ * line boundary.
+ */
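+      /* CACHELINE_SIZE and CACHELINE_MASK are assumed to be the usual
+       * 64-byte cache-line constants; rounding the start down and then
+       * stepping a full line at a time covers an instruction that
+       * straddles a cache-line boundary.
+       */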
+ if (!primary->device->info.has_llc) {
+ void *inst = secondary->batch.next - inst_size;
+ void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK);
+ __builtin_ia32_sfence();
+ while (p < secondary->batch.next) {
+ __builtin_ia32_clflush(p);
+ p += CACHELINE_SIZE;
+ }
+ }
+
break;
}
case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
struct list_head copy_list;
VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
- secondary->device,
+ secondary,
©_list);
if (result != VK_SUCCESS)
return; /* FIXME */
assert(!"Invalid execution mode");
}
- /* Mark the surface buffer from the secondary as seen */
- anv_cmd_buffer_add_seen_bbos(primary, &secondary->surface_bos);
+ anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
+ &secondary->surface_relocs, 0);
}
static VkResult
cmd_buffer->execbuf2.array_length * 2 : 64;
struct drm_i915_gem_exec_object2 *new_objects =
- anv_device_alloc(cmd_buffer->device, new_len * sizeof(*new_objects),
- 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_objects),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_objects == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
struct anv_bo **new_bos =
- anv_device_alloc(cmd_buffer->device, new_len * sizeof(*new_bos),
- 8, VK_SYSTEM_ALLOC_TYPE_INTERNAL);
+ anv_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_bos),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (new_objects == NULL) {
+   if (new_bos == NULL) {
- anv_device_free(cmd_buffer->device, new_objects);
+ anv_free(&cmd_buffer->pool->alloc, new_objects);
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
}
obj->relocation_count = relocs->num_relocs;
obj->relocs_ptr = (uintptr_t) relocs->relocs;
- for (size_t i = 0; i < relocs->num_relocs; i++)
+ for (size_t i = 0; i < relocs->num_relocs; i++) {
+ /* A quick sanity check on relocations */
+ assert(relocs->relocs[i].offset < bo->size);
anv_cmd_buffer_add_bo(cmd_buffer, relocs->reloc_bos[i], NULL);
+ }
}
return VK_SUCCESS;
}
}
+static void
+adjust_relocations_from_block_pool(struct anv_block_pool *pool,
+ struct anv_reloc_list *relocs)
+{
+ for (size_t i = 0; i < relocs->num_relocs; i++) {
+ /* In general, we don't know how stale the relocated value is. It
+ * may have been used last time or it may not. Since we don't want
+ * to stomp it while the GPU may be accessing it, we haven't updated
+ * it anywhere else in the code. Instead, we just set the presumed
+ * offset to what it is now based on the delta and the data in the
+ * block pool. Then the kernel will update it for us if needed.
+ */
+ assert(relocs->relocs[i].offset < pool->state.end);
+ uint32_t *reloc_data = pool->map + relocs->relocs[i].offset;
+
+ /* We're reading back the relocated value from potentially incoherent
+ * memory here. However, any change to the value will be from the kernel
+ * writing out relocations, which will keep the CPU cache up to date.
+ */
+ relocs->relocs[i].presumed_offset = *reloc_data - relocs->relocs[i].delta;
+
+ /* All of the relocations from this block pool to other BOs should
+ * have been emitted relative to the surface block pool center. We
+ * need to add the center offset to make them relative to the
+ * beginning of the actual GEM bo.
+ */
+ relocs->relocs[i].offset += pool->center_bo_offset;
+ }
+}
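+
+/* A worked example of the above, with illustrative numbers: if a reloc has
+ * delta 8 and the dword in the pool currently reads 0x1008, presumed_offset
+ * becomes 0x1000. If that still matches the target bo's address at execbuf
+ * time the kernel leaves the dword alone; otherwise it rewrites it.
+ */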
+
+static void
+adjust_relocations_to_block_pool(struct anv_block_pool *pool,
+ struct anv_bo *from_bo,
+ struct anv_reloc_list *relocs,
+ uint32_t *last_pool_center_bo_offset)
+{
+ assert(*last_pool_center_bo_offset <= pool->center_bo_offset);
+ uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset;
+
+ /* When we initially emit relocations into a block pool, we don't
+ * actually know what the final center_bo_offset will be so we just emit
+ * it as if center_bo_offset == 0. Now that we know what the center
+ * offset is, we need to walk the list of relocations and adjust any
+ * relocations that point to the pool bo with the correct offset.
+ */
+ for (size_t i = 0; i < relocs->num_relocs; i++) {
+ if (relocs->reloc_bos[i] == &pool->bo) {
+ /* Adjust the delta value in the relocation to correctly
+ * correspond to the new delta. Initially, this value may have
+ * been negative (if treated as unsigned), but we trust in
+ * uint32_t roll-over to fix that for us at this point.
+ */
+ relocs->relocs[i].delta += delta;
+
+ /* Since the delta has changed, we need to update the actual
+ * relocated value with the new presumed value. This function
+ * should only be called on batch buffers, so we know it isn't in
+ * use by the GPU at the moment.
+ */
+ assert(relocs->relocs[i].offset < from_bo->size);
+ uint32_t *reloc_data = from_bo->map + relocs->relocs[i].offset;
+ *reloc_data = relocs->relocs[i].presumed_offset +
+ relocs->relocs[i].delta;
+ }
+ }
+
+ *last_pool_center_bo_offset = pool->center_bo_offset;
+}
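+
+/* Example with illustrative numbers: if these relocations were emitted
+ * while center_bo_offset was 0 and the pool has since grown backward so
+ * that it is now 4096, every delta that targets the pool bo gains 4096 and
+ * the corresponding batch dword is rewritten to presumed_offset + delta.
+ */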
+
void
anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_batch *batch = &cmd_buffer->batch;
+ struct anv_block_pool *ss_pool =
+ &cmd_buffer->device->surface_state_block_pool;
cmd_buffer->execbuf2.bo_count = 0;
cmd_buffer->execbuf2.need_reloc = false;
+ adjust_relocations_from_block_pool(ss_pool, &cmd_buffer->surface_relocs);
+ anv_cmd_buffer_add_bo(cmd_buffer, &ss_pool->bo, &cmd_buffer->surface_relocs);
+
/* First, we walk over all of the bos we've seen and add them and their
* relocations to the validate list.
*/
struct anv_batch_bo **bbo;
- anv_vector_foreach(bbo, &cmd_buffer->seen_bbos)
+ anv_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+ adjust_relocations_to_block_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
+ &(*bbo)->last_ss_pool_bo_offset);
+
anv_cmd_buffer_add_bo(cmd_buffer, &(*bbo)->bo, &(*bbo)->relocs);
+ }
struct anv_batch_bo *first_batch_bo =
list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
*/
if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
uint32_t idx = first_batch_bo->bo.index;
+ uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
struct drm_i915_gem_exec_object2 tmp_obj =
cmd_buffer->execbuf2.objects[idx];
assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
- cmd_buffer->execbuf2.objects[idx] =
- cmd_buffer->execbuf2.objects[cmd_buffer->execbuf2.bo_count - 1];
- cmd_buffer->execbuf2.bos[idx] =
- cmd_buffer->execbuf2.bos[cmd_buffer->execbuf2.bo_count - 1];
+ cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
+ cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
cmd_buffer->execbuf2.bos[idx]->index = idx;
- cmd_buffer->execbuf2.objects[cmd_buffer->execbuf2.bo_count - 1] = tmp_obj;
- cmd_buffer->execbuf2.bos[cmd_buffer->execbuf2.bo_count - 1] =
- &first_batch_bo->bo;
- first_batch_bo->bo.index = cmd_buffer->execbuf2.bo_count - 1;
+ cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
+ cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
+ first_batch_bo->bo.index = last_idx;
}
/* Now we go through and fixup all of the relocation lists to point to
anv_vector_foreach(bbo, &cmd_buffer->seen_bbos)
anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);
+ anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);
+
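+   /* On non-LLC platforms the GPU does not snoop the CPU caches, so any
+    * batch contents we wrote through a cached CPU mapping have to be
+    * flushed to memory before the kernel submits the buffers.
+    */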
+ if (!cmd_buffer->device->info.has_llc) {
+ __builtin_ia32_sfence();
+ anv_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+ for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
+ __builtin_ia32_clflush((*bbo)->bo.map + i);
+ }
+ }
+
cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
.buffer_count = cmd_buffer->execbuf2.bo_count,
.num_cliprects = 0,
.DR1 = 0,
.DR4 = 0,
- .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
+ .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER |
+ I915_EXEC_CONSTANTS_REL_GENERAL,
.rsvd1 = cmd_buffer->device->context_id,
.rsvd2 = 0,
};